Diffstat (limited to 'arch/x86/kernel')
117 files changed, 8949 insertions, 5255 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d1bfc847d..6c327b852e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,11 +24,13 @@ CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)  CFLAGS_hpet.o		:= $(nostackp)  CFLAGS_tsc.o		:= $(nostackp)  CFLAGS_paravirt.o	:= $(nostackp) +GCOV_PROFILE_vsyscall_64.o	:= n +GCOV_PROFILE_hpet.o		:= n  obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o  obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o  obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y			+= setup.o i8259.o irqinit_$(BITS).o +obj-y			+= setup.o i8259.o irqinit.o  obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o  obj-$(CONFIG_X86_32)	+= probe_roms_32.o  obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o @@ -44,6 +46,7 @@ obj-y				+= process.o  obj-y				+= i387.o xsave.o  obj-y				+= ptrace.o  obj-$(CONFIG_X86_DS)		+= ds.o +obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o  obj-$(CONFIG_X86_32)		+= tls.o  obj-$(CONFIG_IA32_EMULATION)	+= tls.o  obj-y				+= step.o @@ -72,7 +75,7 @@ obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o  obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o  obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o  obj-$(CONFIG_KPROBES)		+= kprobes.o -obj-$(CONFIG_MODULES)		+= module_$(BITS).o +obj-$(CONFIG_MODULES)		+= module.o  obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o  obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o  obj-$(CONFIG_KGDB)		+= kgdb.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f80..6b8ca3a0285 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@  #include <linux/irq.h>  #include <linux/bootmem.h>  #include <linux/ioport.h> +#include <linux/pci.h>  #include <asm/pgtable.h>  #include <asm/io_apic.h> @@ -43,11 +44,7 @@  static int __initdata acpi_force = 0;  u32 acpi_rsdt_forced; -#ifdef	CONFIG_ACPI -int acpi_disabled = 0; -#else -int acpi_disabled = 1; -#endif +int acpi_disabled;  EXPORT_SYMBOL(acpi_disabled);  #ifdef	CONFIG_X86_64 @@ -121,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)  	early_iounmap(map, size);  } -#ifdef CONFIG_PCI_MMCONFIG - -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - -/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. 
*/ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; - -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ -	if (!strcmp(mcfg->header.oem_id, "SGI")) -		acpi_mcfg_64bit_base_addr = TRUE; - -	return 0; -} - -int __init acpi_parse_mcfg(struct acpi_table_header *header) -{ -	struct acpi_table_mcfg *mcfg; -	unsigned long i; -	int config_size; - -	if (!header) -		return -EINVAL; - -	mcfg = (struct acpi_table_mcfg *)header; - -	/* how many config structures do we have */ -	pci_mmcfg_config_num = 0; -	i = header->length - sizeof(struct acpi_table_mcfg); -	while (i >= sizeof(struct acpi_mcfg_allocation)) { -		++pci_mmcfg_config_num; -		i -= sizeof(struct acpi_mcfg_allocation); -	}; -	if (pci_mmcfg_config_num == 0) { -		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); -		return -ENODEV; -	} - -	config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); -	pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); -	if (!pci_mmcfg_config) { -		printk(KERN_WARNING PREFIX -		       "No memory for MCFG config tables\n"); -		return -ENOMEM; -	} - -	memcpy(pci_mmcfg_config, &mcfg[1], config_size); - -	acpi_mcfg_oem_check(mcfg); - -	for (i = 0; i < pci_mmcfg_config_num; ++i) { -		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && -		    !acpi_mcfg_64bit_base_addr) { -			printk(KERN_ERR PREFIX -			       "MMCONFIG not in low 4GB of memory\n"); -			kfree(pci_mmcfg_config); -			pci_mmcfg_config_num = 0; -			return -ENODEV; -		} -	} - -	return 0; -} -#endif				/* CONFIG_PCI_MMCONFIG */ -  #ifdef CONFIG_X86_LOCAL_APIC  static int __init acpi_parse_madt(struct acpi_table_header *table)  { @@ -522,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)   * success: return IRQ number (>=0)   * failure: return < 0   */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)  {  	unsigned int irq;  	unsigned int plat_gsi = gsi; @@ -532,14 +463,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)  	 * Make sure all (legacy) PCI IRQs are set as level-triggered.  	 */  	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { -		if (triggering == ACPI_LEVEL_SENSITIVE) +		if (trigger == ACPI_LEVEL_SENSITIVE)  			eisa_set_level_irq(gsi);  	}  #endif  #ifdef CONFIG_X86_IO_APIC  	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { -		plat_gsi = mp_register_gsi(gsi, triggering, polarity); +		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);  	}  #endif  	acpi_gsi_to_irq(plat_gsi, &irq); @@ -903,10 +834,8 @@ extern int es7000_plat;  #endif  static struct { -	int apic_id;  	int gsi_base;  	int gsi_end; -	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);  } mp_ioapic_routing[MAX_IO_APICS];  int mp_find_ioapic(int gsi) @@ -986,16 +915,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);  	mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32  	mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else -	mp_ioapics[idx].apicver = 0; -#endif +  	/*  	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups  	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).  	 
*/ -	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;  	mp_ioapic_routing[idx].gsi_base = gsi_base;  	mp_ioapic_routing[idx].gsi_end = gsi_base +  	    io_apic_get_redir_entries(idx); @@ -1158,26 +1083,52 @@ void __init mp_config_acpi_legacy_irqs(void)  	}  } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, +			int polarity)  { +#ifdef CONFIG_X86_MPPARSE +	struct mpc_intsrc mp_irq; +	struct pci_dev *pdev; +	unsigned char number; +	unsigned int devfn;  	int ioapic; -	int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM	4096 -#define IRQ_COMPRESSION_START	64 +	u8 pin; -	static int pci_irq = IRQ_COMPRESSION_START; -	/* -	 * Mapping between Global System Interrupts, which -	 * represent all possible interrupts, and IRQs -	 * assigned to actual devices. -	 */ -	static int gsi_to_irq[MAX_GSI_NUM]; -#else +	if (!acpi_ioapic) +		return 0; +	if (!dev) +		return 0; +	if (dev->bus != &pci_bus_type) +		return 0; + +	pdev = to_pci_dev(dev); +	number = pdev->bus->number; +	devfn = pdev->devfn; +	pin = pdev->pin; +	/* print the entry should happen on mptable identically */ +	mp_irq.type = MP_INTSRC; +	mp_irq.irqtype = mp_INT; +	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | +				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3); +	mp_irq.srcbus = number; +	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); +	ioapic = mp_find_ioapic(gsi); +	mp_irq.dstapic = mp_ioapics[ioapic].apicid; +	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + +	save_mp_irq(&mp_irq); +#endif +	return 0; +} + +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ +	int ioapic; +	int ioapic_pin; +	struct io_apic_irq_attr irq_attr;  	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)  		return gsi; -#endif  	/* Don't set up the ACPI SCI because it's already set up */  	if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,93 +1147,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)  		gsi = ioapic_renumber_irq(ioapic, gsi);  #endif -	/* -	 * Avoid pin reprogramming.  PRTs typically include entries -	 * with redundant pin->gsi mappings (but unique PCI devices); -	 * we only program the IOAPIC on the first. -	 */  	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {  		printk(KERN_ERR "Invalid reference to IOAPIC pin " -		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, +		       "%d-%d\n", mp_ioapics[ioapic].apicid,  		       ioapic_pin);  		return gsi;  	} -	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { -		pr_debug("Pin %d-%d already programmed\n", -			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 -		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else -		return gsi; -#endif -	} - -	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 -	/* -	 * For GSI >= 64, use IRQ compression -	 */ -	if ((gsi >= IRQ_COMPRESSION_START) -	    && (triggering == ACPI_LEVEL_SENSITIVE)) { -		/* -		 * For PCI devices assign IRQs in order, avoiding gaps -		 * due to unused I/O APIC pins. -		 */ -		int irq = gsi; -		if (gsi < MAX_GSI_NUM) { -			/* -			 * Retain the VIA chipset work-around (gsi > 15), but -			 * avoid a problem where the 8254 timer (IRQ0) is setup -			 * via an override (so it's not on pin 0 of the ioapic), -			 * and at the same time, the pin 0 interrupt is a PCI -			 * type.  The gsi > 15 test could cause these two pins -			 * to be shared as IRQ0, and they are not shareable. 
-			 * So test for this condition, and if necessary, avoid -			 * the pin collision. -			 */ -			gsi = pci_irq++; -			/* -			 * Don't assign IRQ used by ACPI SCI -			 */ -			if (gsi == acpi_gbl_FADT.sci_interrupt) -				gsi = pci_irq++; -			gsi_to_irq[irq] = gsi; -		} else { -			printk(KERN_ERR "GSI %u is too high\n", gsi); -			return gsi; -		} -	} -#endif -	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, -				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, -				polarity == ACPI_ACTIVE_HIGH ? 0 : 1); -	return gsi; -} - -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, -			u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE -	struct mpc_intsrc mp_irq; -	int ioapic; -	if (!acpi_ioapic) -		return 0; +	if (enable_update_mptable) +		mp_config_acpi_gsi(dev, gsi, trigger, polarity); -	/* print the entry should happen on mptable identically */ -	mp_irq.type = MP_INTSRC; -	mp_irq.irqtype = mp_INT; -	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | -				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3); -	mp_irq.srcbus = number; -	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); -	ioapic = mp_find_ioapic(gsi); -	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; -	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); +	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, +			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, +			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1); +	io_apic_set_pci_routing(dev, gsi, &irq_attr); -	save_mp_irq(&mp_irq); -#endif -	return 0; +	return gsi;  }  /* @@ -1569,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {  	 },  	{  	 .callback = force_acpi_ht, -	 .ident = "ASUS P4B266", -	 .matches = { -		     DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -		     DMI_MATCH(DMI_BOARD_NAME, "P4B266"), -		     }, -	 }, -	{ -	 .callback = force_acpi_ht,  	 .ident = "ASUS P2B-DS",  	 .matches = {  		     DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bbbe4bbb6f3..8c44c232efc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,  		flags->bm_check = 1;  	else if (c->x86_vendor == X86_VENDOR_INTEL) {  		/* -		 * Today all CPUs that support C3 share cache. -		 * TBD: This needs to look at cache shared map, once -		 * multi-core detection patch makes to the base. +		 * Today all MP CPUs that support C3 share cache. +		 * And caches should not be flushed by software while +		 * entering C3 type state.  		 */  		flags->bm_check = 1;  	} + +	/* +	 * On all recent Intel platforms, ARB_DISABLE is a nop. +	 * So, set bm_control to zero to indicate that ARB_DISABLE +	 * is not required while entering C3 type state on +	 * P4, Core and beyond CPUs +	 */ +	if (c->x86_vendor == X86_VENDOR_INTEL && +	    (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) +			flags->bm_control = 0;  }  EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9de..6a564ac67ef 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@  always		:= wakeup.bin  targets		:= wakeup.elf wakeup.lds -wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o +wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o  # The link order of the video-*.o modules can matter.  
In particular,  # video-vga.o *must* be listed first, followed by video-vesa.o. @@ -42,6 +42,7 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \  		   $(call cc-option, -mpreferred-stack-boundary=2)  KBUILD_CFLAGS	+= $(call cc-option, -m32)  KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n  WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 00000000000..f51eb0bb56c --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 00000000000..6206033ba20 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7c243a2c511..ca93638ba43 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -104,7 +104,7 @@ int acpi_save_state_mem(void)  	initial_gs = per_cpu_offset(smp_processor_id());  #endif  	initial_code = (unsigned long)wakeup_long64; -	saved_magic = 0x123456789abcdef0; +       saved_magic = 0x123456789abcdef0L;  #endif /* CONFIG_64BIT */  	return 0; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad5..9372f0406ad 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,16 @@ struct iommu_cmd {  static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,  			     struct unity_map_entry *e);  static struct dma_ops_domain *find_protection_domain(u16 devid); +static u64* alloc_pte(struct protection_domain *dom, +		      unsigned long address, u64 +		      **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, +				      unsigned long start_page, +				      unsigned int pages); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif  #ifdef CONFIG_AMD_IOMMU_STATS @@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)  {  	struct amd_iommu *iommu; -	list_for_each_entry(iommu, &amd_iommu_list, list) +	for_each_iommu(iommu)  		iommu_poll_events(iommu);  	return IRQ_HANDLED; @@ -425,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)  	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);  } +/* Flush the whole IO/TLB for a given protection domain - including PDE */ +static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +{ +       u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + +       INC_STATS_COUNTER(domain_flush_single); + +       iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); +} +  /*   * This function is used to flush the IO/TLB for a given protection domain   * on every IOMMU in the system @@ -440,7 +459,7 @@ static void iommu_flush_domain(u16 domid)  	__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,  				      domid, 1, 1); -	list_for_each_entry(iommu, &amd_iommu_list, list) { +	for_each_iommu(iommu) {  		spin_lock_irqsave(&iommu->lock, flags);  		__iommu_queue_command(iommu, &cmd);  		__iommu_completion_wait(iommu); @@ -449,6 +468,35 @@ static void iommu_flush_domain(u16 domid)  	}  } +void amd_iommu_flush_all_domains(void) +{ +	int i; + +	for (i = 1; i < MAX_DOMAIN_ID; ++i) { +		if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) +			continue; +		iommu_flush_domain(i); +	} +} + +void 
amd_iommu_flush_all_devices(void) +{ +	struct amd_iommu *iommu; +	int i; + +	for (i = 0; i <= amd_iommu_last_bdf; ++i) { +		if (amd_iommu_pd_table[i] == NULL) +			continue; + +		iommu = amd_iommu_rlookup_table[i]; +		if (!iommu) +			continue; + +		iommu_queue_inv_dev_entry(iommu, i); +		iommu_completion_wait(iommu); +	} +} +  /****************************************************************************   *   * The functions below are used the create the page table mappings for @@ -468,7 +516,7 @@ static int iommu_map_page(struct protection_domain *dom,  			  unsigned long phys_addr,  			  int prot)  { -	u64 __pte, *pte, *page; +	u64 __pte, *pte;  	bus_addr  = PAGE_ALIGN(bus_addr);  	phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +525,7 @@ static int iommu_map_page(struct protection_domain *dom,  	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))  		return -EINVAL; -	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - -	if (!IOMMU_PTE_PRESENT(*pte)) { -		page = (u64 *)get_zeroed_page(GFP_KERNEL); -		if (!page) -			return -ENOMEM; -		*pte = IOMMU_L2_PDE(virt_to_phys(page)); -	} - -	pte = IOMMU_PTE_PAGE(*pte); -	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - -	if (!IOMMU_PTE_PRESENT(*pte)) { -		page = (u64 *)get_zeroed_page(GFP_KERNEL); -		if (!page) -			return -ENOMEM; -		*pte = IOMMU_L1_PDE(virt_to_phys(page)); -	} - -	pte = IOMMU_PTE_PAGE(*pte); -	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; +	pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);  	if (IOMMU_PTE_PRESENT(*pte))  		return -EBUSY; @@ -595,7 +623,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,  		 * as allocated in the aperture  		 */  		if (addr < dma_dom->aperture_size) -			__set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); +			__set_bit(addr >> PAGE_SHIFT, +				  dma_dom->aperture[0]->bitmap);  	}  	return 0; @@ -632,42 +661,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,   ****************************************************************************/  /* - * The address allocator core function. + * The address allocator core functions.   *   * called with domain->lock held   */ + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, +		      unsigned long address) +{ +	u64 *pte; + +	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + +	if (!IOMMU_PTE_PRESENT(*pte)) +		return NULL; + +	pte = IOMMU_PTE_PAGE(*pte); +	pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + +	if (!IOMMU_PTE_PRESENT(*pte)) +		return NULL; + +	pte = IOMMU_PTE_PAGE(*pte); +	pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + +	return pte; +} + +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. 
+ */ +static int alloc_new_range(struct amd_iommu *iommu, +			   struct dma_ops_domain *dma_dom, +			   bool populate, gfp_t gfp) +{ +	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; +	int i; + +#ifdef CONFIG_IOMMU_STRESS +	populate = false; +#endif + +	if (index >= APERTURE_MAX_RANGES) +		return -ENOMEM; + +	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); +	if (!dma_dom->aperture[index]) +		return -ENOMEM; + +	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); +	if (!dma_dom->aperture[index]->bitmap) +		goto out_free; + +	dma_dom->aperture[index]->offset = dma_dom->aperture_size; + +	if (populate) { +		unsigned long address = dma_dom->aperture_size; +		int i, num_ptes = APERTURE_RANGE_PAGES / 512; +		u64 *pte, *pte_page; + +		for (i = 0; i < num_ptes; ++i) { +			pte = alloc_pte(&dma_dom->domain, address, +					&pte_page, gfp); +			if (!pte) +				goto out_free; + +			dma_dom->aperture[index]->pte_pages[i] = pte_page; + +			address += APERTURE_RANGE_SIZE / 64; +		} +	} + +	dma_dom->aperture_size += APERTURE_RANGE_SIZE; + +	/* Intialize the exclusion range if necessary */ +	if (iommu->exclusion_start && +	    iommu->exclusion_start >= dma_dom->aperture[index]->offset && +	    iommu->exclusion_start < dma_dom->aperture_size) { +		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; +		int pages = iommu_num_pages(iommu->exclusion_start, +					    iommu->exclusion_length, +					    PAGE_SIZE); +		dma_ops_reserve_addresses(dma_dom, startpage, pages); +	} + +	/* +	 * Check for areas already mapped as present in the new aperture +	 * range and mark those pages as reserved in the allocator. Such +	 * mappings may already exist as a result of requested unity +	 * mappings for devices. +	 */ +	for (i = dma_dom->aperture[index]->offset; +	     i < dma_dom->aperture_size; +	     i += PAGE_SIZE) { +		u64 *pte = fetch_pte(&dma_dom->domain, i); +		if (!pte || !IOMMU_PTE_PRESENT(*pte)) +			continue; + +		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); +	} + +	return 0; + +out_free: +	free_page((unsigned long)dma_dom->aperture[index]->bitmap); + +	kfree(dma_dom->aperture[index]); +	dma_dom->aperture[index] = NULL; + +	return -ENOMEM; +} + +static unsigned long dma_ops_area_alloc(struct device *dev, +					struct dma_ops_domain *dom, +					unsigned int pages, +					unsigned long align_mask, +					u64 dma_mask, +					unsigned long start) +{ +	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; +	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; +	int i = start >> APERTURE_RANGE_SHIFT; +	unsigned long boundary_size; +	unsigned long address = -1; +	unsigned long limit; + +	next_bit >>= PAGE_SHIFT; + +	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, +			PAGE_SIZE) >> PAGE_SHIFT; + +	for (;i < max_index; ++i) { +		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + +		if (dom->aperture[i]->offset >= dma_mask) +			break; + +		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, +					       dma_mask >> PAGE_SHIFT); + +		address = iommu_area_alloc(dom->aperture[i]->bitmap, +					   limit, next_bit, pages, 0, +					    boundary_size, align_mask); +		if (address != -1) { +			address = dom->aperture[i]->offset + +				  (address << PAGE_SHIFT); +			dom->next_address = address + (pages << PAGE_SHIFT); +			break; +		} + +		next_bit = 0; +	} + +	return address; +} +  static unsigned long dma_ops_alloc_addresses(struct device *dev,  					     struct dma_ops_domain *dom,  					     unsigned int pages,  					 
    unsigned long align_mask,  					     u64 dma_mask)  { -	unsigned long limit;  	unsigned long address; -	unsigned long boundary_size; -	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, -			PAGE_SIZE) >> PAGE_SHIFT; -	limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, -				       dma_mask >> PAGE_SHIFT); +#ifdef CONFIG_IOMMU_STRESS +	dom->next_address = 0; +	dom->need_flush = true; +#endif -	if (dom->next_bit >= limit) { -		dom->next_bit = 0; -		dom->need_flush = true; -	} +	address = dma_ops_area_alloc(dev, dom, pages, align_mask, +				     dma_mask, dom->next_address); -	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, -				   0 , boundary_size, align_mask);  	if (address == -1) { -		address = iommu_area_alloc(dom->bitmap, limit, 0, pages, -				0, boundary_size, align_mask); +		dom->next_address = 0; +		address = dma_ops_area_alloc(dev, dom, pages, align_mask, +					     dma_mask, 0);  		dom->need_flush = true;  	} -	if (likely(address != -1)) { -		dom->next_bit = address + pages; -		address <<= PAGE_SHIFT; -	} else +	if (unlikely(address == -1))  		address = bad_dma_address;  	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -684,11 +862,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,  				   unsigned long address,  				   unsigned int pages)  { -	address >>= PAGE_SHIFT; -	iommu_area_free(dom->bitmap, address, pages); +	unsigned i = address >> APERTURE_RANGE_SHIFT; +	struct aperture_range *range = dom->aperture[i]; + +	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); + +#ifdef CONFIG_IOMMU_STRESS +	if (i < 4) +		return; +#endif -	if (address >= dom->next_bit) +	if (address >= dom->next_address)  		dom->need_flush = true; + +	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + +	iommu_area_free(range->bitmap, address, pages); +  }  /**************************************************************************** @@ -736,12 +926,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,  				      unsigned long start_page,  				      unsigned int pages)  { -	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; +	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;  	if (start_page + pages > last_page)  		pages = last_page - start_page; -	iommu_area_reserve(dom->bitmap, start_page, pages); +	for (i = start_page; i < start_page + pages; ++i) { +		int index = i / APERTURE_RANGE_PAGES; +		int page  = i % APERTURE_RANGE_PAGES; +		__set_bit(page, dom->aperture[index]->bitmap); +	}  }  static void free_pagetable(struct protection_domain *domain) @@ -780,14 +974,19 @@ static void free_pagetable(struct protection_domain *domain)   */  static void dma_ops_domain_free(struct dma_ops_domain *dom)  { +	int i; +  	if (!dom)  		return;  	free_pagetable(&dom->domain); -	kfree(dom->pte_pages); - -	kfree(dom->bitmap); +	for (i = 0; i < APERTURE_MAX_RANGES; ++i) { +		if (!dom->aperture[i]) +			continue; +		free_page((unsigned long)dom->aperture[i]->bitmap); +		kfree(dom->aperture[i]); +	}  	kfree(dom);  } @@ -797,19 +996,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)   * It also intializes the page table and the address allocator data   * structures required for the dma_ops interface   */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, -						   unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)  {  	struct dma_ops_domain *dma_dom; -	unsigned i, num_pte_pages; -	u64 *l2_pde; -	u64 address; - -	/* -	 * 
Currently the DMA aperture must be between 32 MB and 1GB in size -	 */ -	if ((order < 25) || (order > 30)) -		return NULL;  	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);  	if (!dma_dom) @@ -826,55 +1015,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,  	dma_dom->domain.priv = dma_dom;  	if (!dma_dom->domain.pt_root)  		goto free_dma_dom; -	dma_dom->aperture_size = (1ULL << order); -	dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), -				  GFP_KERNEL); -	if (!dma_dom->bitmap) -		goto free_dma_dom; -	/* -	 * mark the first page as allocated so we never return 0 as -	 * a valid dma-address. So we can use 0 as error value -	 */ -	dma_dom->bitmap[0] = 1; -	dma_dom->next_bit = 0;  	dma_dom->need_flush = false;  	dma_dom->target_dev = 0xffff; -	/* Intialize the exclusion range if necessary */ -	if (iommu->exclusion_start && -	    iommu->exclusion_start < dma_dom->aperture_size) { -		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; -		int pages = iommu_num_pages(iommu->exclusion_start, -					    iommu->exclusion_length, -					    PAGE_SIZE); -		dma_ops_reserve_addresses(dma_dom, startpage, pages); -	} +	if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) +		goto free_dma_dom;  	/* -	 * At the last step, build the page tables so we don't need to -	 * allocate page table pages in the dma_ops mapping/unmapping -	 * path. +	 * mark the first page as allocated so we never return 0 as +	 * a valid dma-address. So we can use 0 as error value  	 */ -	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); -	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), -			GFP_KERNEL); -	if (!dma_dom->pte_pages) -		goto free_dma_dom; - -	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); -	if (l2_pde == NULL) -		goto free_dma_dom; +	dma_dom->aperture[0]->bitmap[0] = 1; +	dma_dom->next_address = 0; -	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - -	for (i = 0; i < num_pte_pages; ++i) { -		dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); -		if (!dma_dom->pte_pages[i]) -			goto free_dma_dom; -		address = virt_to_phys(dma_dom->pte_pages[i]); -		l2_pde[i] = IOMMU_L1_PDE(address); -	}  	return dma_dom; @@ -934,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu,  	amd_iommu_pd_table[devid] = domain;  	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +       /* +        * We might boot into a crash-kernel here. The crashed kernel +        * left the caches in the IOMMU dirty. So we have to flush +        * here to evict all dirty stuff. 
+        */  	iommu_queue_inv_dev_entry(iommu, devid); +	iommu_flush_tlb_pde(iommu, domain->id);  }  /* @@ -983,7 +1143,6 @@ static int device_change_notifier(struct notifier_block *nb,  	struct protection_domain *domain;  	struct dma_ops_domain *dma_domain;  	struct amd_iommu *iommu; -	int order = amd_iommu_aperture_order;  	unsigned long flags;  	if (devid > amd_iommu_last_bdf) @@ -1002,17 +1161,7 @@ static int device_change_notifier(struct notifier_block *nb,  			  "to a non-dma-ops domain\n", dev_name(dev));  	switch (action) { -	case BUS_NOTIFY_BOUND_DRIVER: -		if (domain) -			goto out; -		dma_domain = find_protection_domain(devid); -		if (!dma_domain) -			dma_domain = iommu->default_dom; -		attach_device(iommu, &dma_domain->domain, devid); -		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " -		       "device %s\n", dma_domain->domain.id, dev_name(dev)); -		break; -	case BUS_NOTIFY_UNBIND_DRIVER: +	case BUS_NOTIFY_UNBOUND_DRIVER:  		if (!domain)  			goto out;  		detach_device(domain, devid); @@ -1022,7 +1171,7 @@ static int device_change_notifier(struct notifier_block *nb,  		dma_domain = find_protection_domain(devid);  		if (dma_domain)  			goto out; -		dma_domain = dma_ops_domain_alloc(iommu, order); +		dma_domain = dma_ops_domain_alloc(iommu);  		if (!dma_domain)  			goto out;  		dma_domain->target_dev = devid; @@ -1133,8 +1282,8 @@ static int get_device_resources(struct device *dev,  			dma_dom = (*iommu)->default_dom;  		*domain = &dma_dom->domain;  		attach_device(*iommu, *domain, *bdf); -		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " -				"device %s\n", (*domain)->id, dev_name(dev)); +		DUMP_printk("Using protection domain %d for device %s\n", +			    (*domain)->id, dev_name(dev));  	}  	if (domain_for_device(_bdf) == NULL) @@ -1144,6 +1293,66 @@ static int get_device_resources(struct device *dev,  }  /* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, +		      unsigned long address, u64 **pte_page, gfp_t gfp) +{ +	u64 *pte, *page; + +	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + +	if (!IOMMU_PTE_PRESENT(*pte)) { +		page = (u64 *)get_zeroed_page(gfp); +		if (!page) +			return NULL; +		*pte = IOMMU_L2_PDE(virt_to_phys(page)); +	} + +	pte = IOMMU_PTE_PAGE(*pte); +	pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + +	if (!IOMMU_PTE_PRESENT(*pte)) { +		page = (u64 *)get_zeroed_page(gfp); +		if (!page) +			return NULL; +		*pte = IOMMU_L1_PDE(virt_to_phys(page)); +	} + +	pte = IOMMU_PTE_PAGE(*pte); + +	if (pte_page) +		*pte_page = pte; + +	pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + +	return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, +			    unsigned long address) +{ +	struct aperture_range *aperture; +	u64 *pte, *pte_page; + +	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; +	if (!aperture) +		return NULL; + +	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; +	if (!pte) { +		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); +		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; +	} else +		pte += IOMMU_PTE_L0_INDEX(address); + +	return pte; +} + +/*   * This is the generic map function. It maps one 4kb page at paddr to   * the given address in the DMA address space for the domain.   
*/ @@ -1159,8 +1368,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,  	paddr &= PAGE_MASK; -	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; -	pte += IOMMU_PTE_L0_INDEX(address); +	pte  = dma_ops_get_pte(dom, address); +	if (!pte) +		return bad_dma_address;  	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1185,14 +1395,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,  				 struct dma_ops_domain *dom,  				 unsigned long address)  { +	struct aperture_range *aperture;  	u64 *pte;  	if (address >= dom->aperture_size)  		return; -	WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); +	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; +	if (!aperture) +		return; + +	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; +	if (!pte) +		return; -	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];  	pte += IOMMU_PTE_L0_INDEX(address);  	WARN_ON(!*pte); @@ -1216,7 +1432,7 @@ static dma_addr_t __map_single(struct device *dev,  			       u64 dma_mask)  {  	dma_addr_t offset = paddr & ~PAGE_MASK; -	dma_addr_t address, start; +	dma_addr_t address, start, ret;  	unsigned int pages;  	unsigned long align_mask = 0;  	int i; @@ -1232,14 +1448,33 @@ static dma_addr_t __map_single(struct device *dev,  	if (align)  		align_mask = (1UL << get_order(size)) - 1; +retry:  	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,  					  dma_mask); -	if (unlikely(address == bad_dma_address)) -		goto out; +	if (unlikely(address == bad_dma_address)) { +		/* +		 * setting next_address here will let the address +		 * allocator only scan the new allocated range in the +		 * first run. This is a small optimization. +		 */ +		dma_dom->next_address = dma_dom->aperture_size; + +		if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) +			goto out; + +		/* +		 * aperture was sucessfully enlarged by 128 MB, try +		 * allocation again +		 */ +		goto retry; +	}  	start = address;  	for (i = 0; i < pages; ++i) { -		dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); +		ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); +		if (ret == bad_dma_address) +			goto out_unmap; +  		paddr += PAGE_SIZE;  		start += PAGE_SIZE;  	} @@ -1255,6 +1490,17 @@ static dma_addr_t __map_single(struct device *dev,  out:  	return address; + +out_unmap: + +	for (--i; i >= 0; --i) { +		start -= PAGE_SIZE; +		dma_ops_domain_unmap(iommu, dma_dom, start); +	} + +	dma_ops_free_addresses(dma_dom, address, pages); + +	return bad_dma_address;  }  /* @@ -1537,8 +1783,10 @@ static void *alloc_coherent(struct device *dev, size_t size,  	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,  				 size, DMA_BIDIRECTIONAL, true, dma_mask); -	if (*dma_addr == bad_dma_address) +	if (*dma_addr == bad_dma_address) { +		spin_unlock_irqrestore(&domain->lock, flags);  		goto out_free; +	}  	iommu_completion_wait(iommu); @@ -1625,7 +1873,6 @@ static void prealloc_protection_domains(void)  	struct pci_dev *dev = NULL;  	struct dma_ops_domain *dma_dom;  	struct amd_iommu *iommu; -	int order = amd_iommu_aperture_order;  	u16 devid;  	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1638,7 +1885,7 @@ static void prealloc_protection_domains(void)  		iommu = amd_iommu_rlookup_table[devid];  		if (!iommu)  			continue; -		dma_dom = dma_ops_domain_alloc(iommu, order); +		dma_dom = dma_ops_domain_alloc(iommu);  		if (!dma_dom)  			continue;  		init_unity_mappings_for_device(dma_dom, devid); @@ -1664,7 +1911,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {  int __init 
amd_iommu_init_dma_ops(void)  {  	struct amd_iommu *iommu; -	int order = amd_iommu_aperture_order;  	int ret;  	/* @@ -1672,8 +1918,8 @@ int __init amd_iommu_init_dma_ops(void)  	 * found in the system. Devices not assigned to any other  	 * protection domain will be assigned to the default one.  	 */ -	list_for_each_entry(iommu, &amd_iommu_list, list) { -		iommu->default_dom = dma_ops_domain_alloc(iommu, order); +	for_each_iommu(iommu) { +		iommu->default_dom = dma_ops_domain_alloc(iommu);  		if (iommu->default_dom == NULL)  			return -ENOMEM;  		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -1710,7 +1956,7 @@ int __init amd_iommu_init_dma_ops(void)  free_domains: -	list_for_each_entry(iommu, &amd_iommu_list, list) { +	for_each_iommu(iommu) {  		if (iommu->default_dom)  			dma_ops_domain_free(iommu->default_dom);  	} @@ -1842,7 +2088,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,  	old_domain = domain_for_device(devid);  	if (old_domain) -		return -EBUSY; +		detach_device(old_domain, devid);  	attach_device(iommu, domain, devid); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902da..10b2accd12e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,15 +115,21 @@ struct ivmd_header {  	u64 range_length;  } __attribute__((packed)); +bool amd_iommu_dump; +  static int __initdata amd_iommu_detected;  u16 amd_iommu_last_bdf;			/* largest PCI device id we have  					   to handle */  LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings  					   we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else  bool amd_iommu_isolate = true;		/* if true, device isolation is  					   enabled */ +#endif +  bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */  LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the @@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)  static inline unsigned long tbl_size(int entry_size)  {  	unsigned shift = PAGE_SHIFT + -			 get_order(amd_iommu_last_bdf * entry_size); +			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);  	return 1UL << shift;  } @@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)   * This function set the exclusion range in the IOMMU. DMA accesses to the   * exclusion range are passed through untranslated   */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu)  {  	u64 start = iommu->exclusion_start & PAGE_MASK;  	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)  }  /* Generic functions to enable/disable certain features of the IOMMU. 
*/ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)  {  	u32 ctrl; @@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)  }  /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu)  {  	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",  	       dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -252,11 +258,17 @@ static void __init iommu_enable(struct amd_iommu *iommu)  	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);  } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void iommu_disable(struct amd_iommu *iommu)  { -	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); -	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); +	/* Disable command buffer */ +	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + +	/* Disable event logging and event interrupts */ +	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); +	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + +	/* Disable IOMMU hardware itself */ +	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);  }  /* @@ -413,25 +425,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)  {  	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,  			get_order(CMD_BUFFER_SIZE)); -	u64 entry;  	if (cmd_buf == NULL)  		return NULL;  	iommu->cmd_buf_size = CMD_BUFFER_SIZE; -	entry = (u64)virt_to_phys(cmd_buf); +	return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ +	u64 entry; + +	BUG_ON(iommu->cmd_buf == NULL); + +	entry = (u64)virt_to_phys(iommu->cmd_buf);  	entry |= MMIO_CMD_SIZE_512; +  	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, -			&entry, sizeof(entry)); +		    &entry, sizeof(entry));  	/* set head and tail to zero manually */  	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);  	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);  	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - -	return cmd_buf;  }  static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +466,31 @@ static void __init free_command_buffer(struct amd_iommu *iommu)  /* allocates the memory where the IOMMU will log its events to */  static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)  { -	u64 entry;  	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,  						get_order(EVT_BUFFER_SIZE));  	if (iommu->evt_buf == NULL)  		return NULL; +	return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ +	u64 entry; + +	BUG_ON(iommu->evt_buf == NULL); +  	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; +  	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,  		    &entry, sizeof(entry)); -	iommu->evt_buf_size = EVT_BUFFER_SIZE; +	/* set head and tail to zero manually */ +	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); +	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); -	return iommu->evt_buf; +	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);  }  static void __init free_event_buffer(struct amd_iommu *iommu) @@ -596,32 +630,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,  	p += sizeof(struct ivhd_header);  	end += h->length; +  	while (p < end) {  		e = (struct ivhd_entry *)p;  		switch 
(e->type) {  		case IVHD_DEV_ALL: + +			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x" +				    " last device %02x:%02x.%x flags: %02x\n", +				    PCI_BUS(iommu->first_device), +				    PCI_SLOT(iommu->first_device), +				    PCI_FUNC(iommu->first_device), +				    PCI_BUS(iommu->last_device), +				    PCI_SLOT(iommu->last_device), +				    PCI_FUNC(iommu->last_device), +				    e->flags); +  			for (dev_i = iommu->first_device;  					dev_i <= iommu->last_device; ++dev_i)  				set_dev_entry_from_acpi(iommu, dev_i,  							e->flags, 0);  			break;  		case IVHD_DEV_SELECT: + +			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x " +				    "flags: %02x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags); +  			devid = e->devid;  			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);  			break;  		case IVHD_DEV_SELECT_RANGE_START: + +			DUMP_printk("  DEV_SELECT_RANGE_START\t " +				    "devid: %02x:%02x.%x flags: %02x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags); +  			devid_start = e->devid;  			flags = e->flags;  			ext_flags = 0;  			alias = false;  			break;  		case IVHD_DEV_ALIAS: + +			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x " +				    "flags: %02x devid_to: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, +				    PCI_BUS(e->ext >> 8), +				    PCI_SLOT(e->ext >> 8), +				    PCI_FUNC(e->ext >> 8)); +  			devid = e->devid;  			devid_to = e->ext >> 8; -			set_dev_entry_from_acpi(iommu, devid, e->flags, 0); +			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);  			amd_iommu_alias_table[devid] = devid_to;  			break;  		case IVHD_DEV_ALIAS_RANGE: + +			DUMP_printk("  DEV_ALIAS_RANGE\t\t " +				    "devid: %02x:%02x.%x flags: %02x " +				    "devid_to: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, +				    PCI_BUS(e->ext >> 8), +				    PCI_SLOT(e->ext >> 8), +				    PCI_FUNC(e->ext >> 8)); +  			devid_start = e->devid;  			flags = e->flags;  			devid_to = e->ext >> 8; @@ -629,17 +714,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,  			alias = true;  			break;  		case IVHD_DEV_EXT_SELECT: + +			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " +				    "flags: %02x ext: %08x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, e->ext); +  			devid = e->devid;  			set_dev_entry_from_acpi(iommu, devid, e->flags,  						e->ext);  			break;  		case IVHD_DEV_EXT_SELECT_RANGE: + +			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: " +				    "%02x:%02x.%x flags: %02x ext: %08x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid), +				    e->flags, e->ext); +  			devid_start = e->devid;  			flags = e->flags;  			ext_flags = e->ext;  			alias = false;  			break;  		case IVHD_DEV_RANGE_END: + +			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", +				    PCI_BUS(e->devid), +				    PCI_SLOT(e->devid), +				    PCI_FUNC(e->devid)); +  			devid = e->devid;  			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {  				if (alias) @@ -679,7 +786,7 @@ static void __init free_iommu_all(void)  {  	struct amd_iommu *iommu, *next; -	list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { +	for_each_iommu_safe(iommu, next) {  		list_del(&iommu->list);  		free_iommu_one(iommu);  		
kfree(iommu); @@ -710,7 +817,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)  	if (!iommu->mmio_base)  		return -ENOMEM; -	iommu_set_device_table(iommu);  	iommu->cmd_buf = alloc_command_buffer(iommu);  	if (!iommu->cmd_buf)  		return -ENOMEM; @@ -746,6 +852,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)  		h = (struct ivhd_header *)p;  		switch (*p) {  		case ACPI_IVHD_TYPE: + +			DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " +				    "seg: %d flags: %01x info %04x\n", +				    PCI_BUS(h->devid), PCI_SLOT(h->devid), +				    PCI_FUNC(h->devid), h->cap_ptr, +				    h->pci_seg, h->flags, h->info); +			DUMP_printk("       mmio-addr: %016llx\n", +				    h->mmio_phys); +  			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);  			if (iommu == NULL)  				return -ENOMEM; @@ -773,56 +888,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)   *   ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ -	struct amd_iommu *curr; -	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ -	int nvec = 0, i; - -	list_for_each_entry(curr, &amd_iommu_list, list) { -		if (curr->dev == iommu->dev) { -			entries[nvec].entry = curr->evt_msi_num; -			entries[nvec].vector = 0; -			curr->int_enabled = true; -			nvec++; -		} -	} - -	if (pci_enable_msix(iommu->dev, entries, nvec)) { -		pci_disable_msix(iommu->dev); -		return 1; -	} - -	for (i = 0; i < nvec; ++i) { -		int r = request_irq(entries->vector, amd_iommu_int_handler, -				    IRQF_SAMPLE_RANDOM, -				    "AMD IOMMU", -				    NULL); -		if (r) -			goto out_free; -	} - -	return 0; - -out_free: -	for (i -= 1; i >= 0; --i) -		free_irq(entries->vector, NULL); - -	pci_disable_msix(iommu->dev); - -	return 1; -} -  static int __init iommu_setup_msi(struct amd_iommu *iommu)  {  	int r; -	struct amd_iommu *curr; - -	list_for_each_entry(curr, &amd_iommu_list, list) { -		if (curr->dev == iommu->dev) -			curr->int_enabled = true; -	} -  	if (pci_enable_msi(iommu->dev))  		return 1; @@ -837,17 +905,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)  		return 1;  	} +	iommu->int_enabled = true; +	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); +  	return 0;  } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu)  {  	if (iommu->int_enabled)  		return 0; -	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) -		return iommu_setup_msix(iommu); -	else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) +	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))  		return iommu_setup_msi(iommu);  	return 1; @@ -899,6 +968,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)  static int __init init_unity_map_range(struct ivmd_header *m)  {  	struct unity_map_entry *e = 0; +	char *s;  	e = kzalloc(sizeof(*e), GFP_KERNEL);  	if (e == NULL) @@ -906,14 +976,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)  	switch (m->type) {  	default: +		kfree(e); +		return 0;  	case ACPI_IVMD_TYPE: +		s = "IVMD_TYPEi\t\t\t";  		e->devid_start = e->devid_end = m->devid;  		break;  	case ACPI_IVMD_TYPE_ALL: +		s = "IVMD_TYPE_ALL\t\t";  		e->devid_start = 0;  		e->devid_end = amd_iommu_last_bdf;  		break;  	case ACPI_IVMD_TYPE_RANGE: +		s = "IVMD_TYPE_RANGE\t\t";  		e->devid_start = m->devid;  		e->devid_end = m->aux;  		break; @@ -922,6 +997,13 @@ static int __init init_unity_map_range(struct ivmd_header 
*m)  	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);  	e->prot = m->flags >> 1; +	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" +		    " range_start: %016llx range_end: %016llx flags: %x\n", s, +		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), +		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), +		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), +		    e->address_start, e->address_end, m->flags); +  	list_add_tail(&e->list, &amd_iommu_unity_map);  	return 0; @@ -967,18 +1049,29 @@ static void init_device_table(void)   * This function finally enables all IOMMUs found in the system after   * they have been initialized   */ -static void __init enable_iommus(void) +static void enable_iommus(void)  {  	struct amd_iommu *iommu; -	list_for_each_entry(iommu, &amd_iommu_list, list) { +	for_each_iommu(iommu) { +		iommu_disable(iommu); +		iommu_set_device_table(iommu); +		iommu_enable_command_buffer(iommu); +		iommu_enable_event_buffer(iommu);  		iommu_set_exclusion_range(iommu);  		iommu_init_msi(iommu); -		iommu_enable_event_logging(iommu);  		iommu_enable(iommu);  	}  } +static void disable_iommus(void) +{ +	struct amd_iommu *iommu; + +	for_each_iommu(iommu) +		iommu_disable(iommu); +} +  /*   * Suspend/Resume support   * disable suspend until real resume implemented @@ -986,12 +1079,25 @@ static void __init enable_iommus(void)  static int amd_iommu_resume(struct sys_device *dev)  { +	/* re-load the hardware */ +	enable_iommus(); + +	/* +	 * we have to flush after the IOMMUs are enabled because a +	 * disabled IOMMU will never execute the commands we send +	 */ +	amd_iommu_flush_all_devices(); +	amd_iommu_flush_all_domains(); +  	return 0;  }  static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)  { -	return -EINVAL; +	/* disable IOMMUs to go out of the way for BIOS */ +	disable_iommus(); + +	return 0;  }  static struct sysdev_class amd_iommu_sysdev_class = { @@ -1137,9 +1243,6 @@ int __init amd_iommu_init(void)  	enable_iommus(); -	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", -			(1 << (amd_iommu_aperture_order-20))); -  	printk(KERN_INFO "AMD IOMMU: device isolation ");  	if (amd_iommu_isolate)  		printk("enabled\n"); @@ -1177,6 +1280,11 @@ free:  	goto out;  } +void amd_iommu_shutdown(void) +{ +	disable_iommus(); +} +  /****************************************************************************   *   * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1211,6 +1319,13 @@ void __init amd_iommu_detect(void)   *   ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ +	amd_iommu_dump = true; + +	return 1; +} +  static int __init parse_amd_iommu_options(char *str)  {  	for (; *str; ++str) { @@ -1225,15 +1340,5 @@ static int __init parse_amd_iommu_options(char *str)  	return 1;  } -static int __init parse_amd_iommu_size_options(char *str) -{ -	unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - -	if ((order > 24) && (order < 31)) -		amd_iommu_aperture_order = order; - -	return 1; -} - +__setup("amd_iommu_dump", parse_amd_iommu_dump);  __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f24..8c7c042ecad 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@   *	Mikael Pettersson	:	PM converted to driver model.   
*/ +#include <linux/perf_counter.h>  #include <linux/kernel_stat.h>  #include <linux/mc146818rtc.h>  #include <linux/acpi_pmtmr.h> @@ -34,6 +35,7 @@  #include <linux/smp.h>  #include <linux/mm.h> +#include <asm/perf_counter.h>  #include <asm/pgalloc.h>  #include <asm/atomic.h>  #include <asm/mpspec.h> @@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);  /* Local APIC was disabled by the BIOS and enabled by the kernel */  static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ +	/* select IMCR register */ +	outb(0x70, 0x22); +	/* NMI and 8259 INTR go through APIC */ +	outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ +	/* select IMCR register */ +	outb(0x70, 0x22); +	/* NMI and 8259 INTR go directly to BSP */ +	outb(0x00, 0x23); +}  #endif  #ifdef CONFIG_X86_64 @@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)  __setup("apicpmtimer", setup_apicpmtimer);  #endif +int x2apic_mode;  #ifdef CONFIG_X86_X2APIC -int x2apic;  /* x2apic enabled before OS handover */  static int x2apic_preenabled;  static int disable_x2apic;  static __init int setup_nox2apic(char *str)  { +	if (x2apic_enabled()) { +		pr_warning("Bios already enabled x2apic, " +			   "can't enforce nox2apic"); +		return 0; +	} +  	disable_x2apic = 1;  	setup_clear_cpu_cap(X86_FEATURE_X2APIC);  	return 0; @@ -209,6 +240,31 @@ static int modern_apic(void)  	return lapic_get_version() >= 0x14;  } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +static void native_apic_write_dummy(u32 reg, u32 v) +{ +	WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +static u32 native_apic_read_dummy(u32 reg) +{ +	WARN_ON_ONCE((cpu_has_apic && !disable_apic)); +	return 0; +} + +/* + * right after this call apic->write/read doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ +	apic->read = native_apic_read_dummy; +	apic->write = native_apic_write_dummy; +} +  void native_apic_wait_icr_idle(void)  {  	while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)  static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)  { -	unsigned long reg = (lvt_off << 4) + APIC_EILVT0; +	unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);  	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;  	apic_write(reg, v); @@ -815,7 +871,7 @@ void clear_local_APIC(void)  	u32 v;  	/* APIC hasn't been mapped yet */ -	if (!x2apic && !apic_phys) +	if (!x2apic_mode && !apic_phys)  		return;  	maxlvt = lapic_get_maxlvt(); @@ -843,7 +899,7 @@ void clear_local_APIC(void)  	}  	/* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR  	if (maxlvt >= 5) {  		v = apic_read(APIC_LVTTHMR);  		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)  		apic_write(APIC_ESR, 0);  	}  #endif +	perf_counters_lapic_init();  	preempt_disable(); @@ -1287,7 +1344,7 @@ void check_x2apic(void)  {  	if (x2apic_enabled()) {  		pr_info("x2apic enabled by BIOS, switching to 
x2apic ops\n"); -		x2apic_preenabled = x2apic = 1; +		x2apic_preenabled = x2apic_mode = 1;  	}  } @@ -1295,7 +1352,7 @@ void enable_x2apic(void)  {  	int msr, msr2; -	if (!x2apic) +	if (!x2apic_mode)  		return;  	rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1304,6 +1361,7 @@ void enable_x2apic(void)  		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);  	}  } +#endif /* CONFIG_X86_X2APIC */  void __init enable_IR_x2apic(void)  { @@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)  	unsigned long flags;  	struct IO_APIC_route_entry **ioapic_entries = NULL; -	if (!cpu_has_x2apic) -		return; - -	if (!x2apic_preenabled && disable_x2apic) { -		pr_info("Skipped enabling x2apic and Interrupt-remapping " -			"because of nox2apic\n"); -		return; +	ret = dmar_table_init(); +	if (ret) { +		pr_debug("dmar_table_init() failed with %d:\n", ret); +		goto ir_failed;  	} -	if (x2apic_preenabled && disable_x2apic) -		panic("Bios already enabled x2apic, can't enforce nox2apic"); - -	if (!x2apic_preenabled && skip_ioapic_setup) { -		pr_info("Skipped enabling x2apic and Interrupt-remapping " -			"because of skipping io-apic setup\n"); -		return; +	if (!intr_remapping_supported()) { +		pr_debug("intr-remapping not supported\n"); +		goto ir_failed;  	} -	ret = dmar_table_init(); -	if (ret) { -		pr_info("dmar_table_init() failed with %d:\n", ret); -		if (x2apic_preenabled) -			panic("x2apic enabled by bios. But IR enabling failed"); -		else -			pr_info("Not enabling x2apic,Intr-remapping\n"); +	if (!x2apic_preenabled && skip_ioapic_setup) { +		pr_info("Skipped enabling intr-remap because of skipping " +			"io-apic setup\n");  		return;  	} @@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)  	mask_IO_APIC_setup(ioapic_entries);  	mask_8259A(); -	ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - -	if (ret && x2apic_preenabled) { -		local_irq_restore(flags); -		panic("x2apic enabled by bios. But IR enabling failed"); -	} - +	ret = enable_intr_remapping(x2apic_supported());  	if (ret)  		goto end_restore; -	if (!x2apic) { -		x2apic = 1; +	pr_info("Enabled Interrupt-remapping\n"); + +	if (x2apic_supported() && !x2apic_mode) { +		x2apic_mode = 1;  		enable_x2apic(); +		pr_info("Enabled x2apic\n");  	}  end_restore: @@ -1378,37 +1422,34 @@ end_restore:  		 * IR enabling failed  		 */  		restore_IO_APIC_setup(ioapic_entries); -	else -		reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);  	unmask_8259A();  	local_irq_restore(flags);  end: -	if (!ret) { -		if (!x2apic_preenabled) -			pr_info("Enabled x2apic and interrupt-remapping\n"); -		else -			pr_info("Enabled Interrupt-remapping\n"); -	} else -		pr_err("Failed to enable Interrupt-remapping and x2apic\n");  	if (ioapic_entries)  		free_ioapic_entries(ioapic_entries); + +	if (!ret) +		return; + +ir_failed: +	if (x2apic_preenabled) +		panic("x2apic enabled by bios. 
But IR enabling failed"); +	else if (cpu_has_x2apic) +		pr_info("Not enabling x2apic,Intr-remapping\n");  #else  	if (!cpu_has_x2apic)  		return;  	if (x2apic_preenabled)  		panic("x2apic enabled prior OS handover," -		      " enable CONFIG_INTR_REMAP"); - -	pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " -		" and x2apic\n"); +		      " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");  #endif  	return;  } -#endif /* CONFIG_X86_X2APIC */ +  #ifdef CONFIG_X86_64  /* @@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)  	}  	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -	boot_cpu_physical_apicid = 0;  	return 0;  }  #else @@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)   */  void __init init_apic_mappings(void)  { -	if (x2apic) { +	unsigned int new_apicid; + +	if (x2apic_mode) {  		boot_cpu_physical_apicid = read_apic_id();  		return;  	} -	/* -	 * If no local APIC can be found then set up a fake all -	 * zeroes page to simulate the local APIC and another -	 * one for the IO-APIC. -	 */ +	/* If no local APIC can be found return early */  	if (!smp_found_config && detect_init_APIC()) { -		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); -		apic_phys = __pa(apic_phys); -	} else +		/* lets NOP'ify apic operations */ +		pr_info("APIC: disable apic facility\n"); +		apic_disable(); +	} else {  		apic_phys = mp_lapic_addr; -	set_fixmap_nocache(FIX_APIC_BASE, apic_phys); -	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", -				APIC_BASE, apic_phys); +		/* +		 * acpi lapic path already maps that address in +		 * acpi_register_lapic_address() +		 */ +		if (!acpi_lapic) +			set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + +		apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", +					APIC_BASE, apic_phys); +	}  	/*  	 * Fetch the APIC ID of the BSP in case we have a  	 * default configuration (or the MP table is broken).  	 
*/ -	if (boot_cpu_physical_apicid == -1U) -		boot_cpu_physical_apicid = read_apic_id(); +	new_apicid = read_apic_id(); +	if (boot_cpu_physical_apicid != new_apicid) { +		boot_cpu_physical_apicid = new_apicid; +		/* +		 * yeah -- we lie about apic_version +		 * in case if apic was disabled via boot option +		 * but it's not a problem for SMP compiled kernel +		 * since smp_sanity_check is prepared for such a case +		 * and disable smp mode +		 */ +		apic_version[new_apicid] = +			 GET_APIC_VERSION(apic_read(APIC_LVR)); +	}  }  /* @@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)  		 */  		apic_printk(APIC_VERBOSE, "leaving PIC mode, "  				"enabling APIC mode.\n"); -		outb(0x70, 0x22); -		outb(0x01, 0x23); +		imcr_pic_to_apic();  	}  #endif  	if (apic->enable_apic_mode) @@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)  		 */  		apic_printk(APIC_VERBOSE, "disabling APIC mode, "  				"entering PIC mode.\n"); -		outb(0x70, 0x22); -		outb(0x00, 0x23); +		imcr_apic_to_pic();  		return;  	}  #endif @@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)  	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);  	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);  	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR  	if (maxlvt >= 5)  		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);  #endif  	local_irq_save(flags);  	disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP +  	if (intr_remapping_enabled)  		disable_intr_remapping(); -#endif +  	local_irq_restore(flags);  	return 0;  } @@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)  	unsigned int l, h;  	unsigned long flags;  	int maxlvt; - -#ifdef CONFIG_INTR_REMAP -	int ret; +	int ret = 0;  	struct IO_APIC_route_entry **ioapic_entries = NULL;  	if (!apic_pm_state.active)  		return 0;  	local_irq_save(flags); -	if (x2apic) { +	if (intr_remapping_enabled) {  		ioapic_entries = alloc_ioapic_entries();  		if (!ioapic_entries) {  			WARN(1, "Alloc ioapic_entries in lapic resume failed."); -			return -ENOMEM; +			ret = -ENOMEM; +			goto restore;  		}  		ret = save_IO_APIC_setup(ioapic_entries);  		if (ret) {  			WARN(1, "Saving IO-APIC state failed: %d\n", ret);  			free_ioapic_entries(ioapic_entries); -			return ret; +			goto restore;  		}  		mask_IO_APIC_setup(ioapic_entries);  		mask_8259A(); -		enable_x2apic();  	} -#else -	if (!apic_pm_state.active) -		return 0; -	local_irq_save(flags); -	if (x2apic) +	if (x2apic_mode)  		enable_x2apic(); -#endif -  	else {  		/*  		 * Make sure the APICBASE points to the right address @@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)  	apic_write(APIC_ESR, 0);  	apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP -	if (intr_remapping_enabled) -		reenable_intr_remapping(EIM_32BIT_APIC_ID); - -	if (x2apic) { +	if (intr_remapping_enabled) { +		reenable_intr_remapping(x2apic_mode);  		unmask_8259A();  		restore_IO_APIC_setup(ioapic_entries);  		free_ioapic_entries(ioapic_entries);  	} -#endif - +restore:  	local_irq_restore(flags); - -	return 0; +	return ret;  }  /* @@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }  #endif	/* CONFIG_PM */  #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. 
Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void)  {  	int i, clusters, zeros;  	unsigned id;  	u16 *bios_cpu_apicid;  	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); -	/* -	 * there is not this kind of box with AMD CPU yet. -	 * Some AMD box with quadcore cpu and 8 sockets apicid -	 * will be [4, 0x23] or [8, 0x27] could be thought to -	 * vsmp box still need checking... -	 */ -	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) -		return 0; -  	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);  	bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)  			++zeros;  	} -	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are -	 * not guaranteed to be synced between boards -	 */ -	if (is_vsmp_box() && clusters > 1) +	return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ +	if (multi) +		return 0; +	pr_info("APIC: %s detected, Multi Chassis\n", d->ident); +	multi = 1; +	return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { +	{ +		.callback = set_multi, +		.ident = "IBM System Summit2", +		.matches = { +			DMI_MATCH(DMI_SYS_VENDOR, "IBM"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), +		}, +	}, +	{} +}; + +static void __cpuinit dmi_check_multi(void) +{ +	if (multi_checked) +		return; + +	dmi_check_system(multi_dmi_table); +	multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ +	dmi_check_multi(); +	if (multi)  		return 1; +	if (!is_vsmp_box()) +		return 0; +  	/* -	 * If clusters > 2, then should be multi-chassis. -	 * May have to revisit this when multi-core + hyperthreaded CPUs come -	 * out, but AFAIK this will work even for them. +	 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are +	 * not guaranteed to be synced between boards  	 */ -	return (clusters > 2); +	if (apic_cluster_num() > 1) +		return 1; + +	return 0;  }  #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6..d0c99abc26c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)  static int flat_phys_pkg_id(int initial_apic_id, int index_msb)  { -	return hard_smp_processor_id() >> index_msb; +	return initial_apic_id >> index_msb;  }  struct apic apic_flat =  { @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  	 * regardless of how many processors are present (x86_64 ES7000  	 * is an example).  	 
*/ -	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && +	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&  		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {  		printk(KERN_DEBUG "system APIC only can use physical flat");  		return 1; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 30294777557..69328ac8de9 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)  	return gsi;  } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)  {  	unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e..b7a79207295 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@  #include <asm/setup.h>  #include <asm/irq_remapping.h>  #include <asm/hpet.h> +#include <asm/hw_irq.h>  #include <asm/uv/uv_hub.h>  #include <asm/uv/uv_irq.h> @@ -129,12 +130,9 @@ struct irq_pin_list {  	struct irq_pin_list *next;  }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node)  {  	struct irq_pin_list *pin; -	int node; - -	node = cpu_to_node(cpu);  	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -148,9 +146,6 @@ struct irq_cfg {  	unsigned move_cleanup_count;  	u8 vector;  	u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -	u8 move_desc_pending : 1; -#endif  };  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)  	struct irq_cfg *cfg;  	struct irq_desc *desc;  	int count; +	int node;  	int i;  	cfg = irq_cfgx;  	count = ARRAY_SIZE(irq_cfgx); +	node= cpu_to_node(boot_cpu_id);  	for (i = 0; i < count; i++) {  		desc = irq_to_desc(i);  		desc->chip_data = &cfg[i]; -		alloc_bootmem_cpumask_var(&cfg[i].domain); -		alloc_bootmem_cpumask_var(&cfg[i].old_domain); +		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); +		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);  		if (i < NR_IRQS_LEGACY)  			cpumask_setall(cfg[i].domain);  	} @@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)  	return cfg;  } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node)  {  	struct irq_cfg *cfg; -	int node; - -	node = cpu_to_node(cpu);  	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);  	if (cfg) { @@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)  	return cfg;  } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node)  {  	struct irq_cfg *cfg;  	cfg = desc->chip_data;  	if (!cfg) { -		desc->chip_data = get_one_free_irq_cfg(cpu); +		desc->chip_data = get_one_free_irq_cfg(node);  		if (!desc->chip_data) {  			printk(KERN_ERR "can not alloc irq_cfg\n");  			BUG_ON(1); @@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)  	return 0;  } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */  static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)  {  	struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)  	if (!old_entry)  		
return; -	entry = get_one_free_irq_2_pin(cpu); +	entry = get_one_free_irq_2_pin(node);  	if (!entry)  		return; @@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)  	tail		= entry;  	old_entry	= old_entry->next;  	while (old_entry) { -		entry = get_one_free_irq_2_pin(cpu); +		entry = get_one_free_irq_2_pin(node);  		if (!entry) {  			entry = head;  			while (entry) { @@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)  }  void arch_init_copy_chip_data(struct irq_desc *old_desc, -				 struct irq_desc *desc, int cpu) +				 struct irq_desc *desc, int node)  {  	struct irq_cfg *cfg;  	struct irq_cfg *old_cfg; -	cfg = get_one_free_irq_cfg(cpu); +	cfg = get_one_free_irq_cfg(node);  	if (!cfg)  		return; @@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,  	memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); -	init_copy_irq_2_pin(old_cfg, cfg, cpu); +	init_copy_irq_2_pin(old_cfg, cfg, node);  }  static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)  		old_desc->chip_data = NULL;  	}  } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -	struct irq_cfg *cfg = desc->chip_data; - -	if (!cfg->move_in_progress) { -		/* it means that domain is not changed */ -		if (!cpumask_intersects(desc->affinity, mask)) -			cfg->move_desc_pending = 1; -	} -} -#endif +/* end for move_irq_desc */  #else  static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)  #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif -  struct io_apic {  	unsigned int index;  	unsigned int unused[3]; @@ -488,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)  static void  __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)  { -	union entry_union eu; +	union entry_union eu = {{0, 0}}; +  	eu.entry = e;  	io_apic_write(apic, 0x11 + 2*pin, eu.w2);  	io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -518,132 +493,18 @@ static void ioapic_mask_entry(int apic, int pin)  	spin_unlock_irqrestore(&ioapic_lock, flags);  } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ -	cpumask_var_t cleanup_mask; - -	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { -		unsigned int i; -		cfg->move_cleanup_count = 0; -		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) -			cfg->move_cleanup_count++; -		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) -			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); -	} else { -		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); -		cfg->move_cleanup_count = cpumask_weight(cleanup_mask); -		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); -		free_cpumask_var(cleanup_mask); -	} -	cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ -	int apic, pin; -	struct irq_pin_list *entry; -	u8 vector = cfg->vector; - -	entry = cfg->irq_2_pin; -	for (;;) { -		unsigned int reg; - -		if (!entry) -			break; - -		apic = entry->apic; -		pin = entry->pin; -		/* -		 * With interrupt-remapping, destination information comes -		 * from interrupt-remapping table entry. 
-		 */ -		if (!irq_remapped(irq)) -			io_apic_write(apic, 0x11 + pin*2, dest); -		reg = io_apic_read(apic, 0x10 + pin*2); -		reg &= ~IO_APIC_REDIR_VECTOR_MASK; -		reg |= vector; -		io_apic_modify(apic, 0x10 + pin*2, reg); -		if (!entry->next) -			break; -		entry = entry->next; -	} -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ -	struct irq_cfg *cfg; -	unsigned int irq; - -	if (!cpumask_intersects(mask, cpu_online_mask)) -		return BAD_APICID; - -	irq = desc->irq; -	cfg = desc->chip_data; -	if (assign_irq_vector(irq, cfg, mask)) -		return BAD_APICID; - -	/* check that before desc->addinity get updated */ -	set_extra_move_desc(desc, mask); - -	cpumask_copy(desc->affinity, mask); - -	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -	struct irq_cfg *cfg; -	unsigned long flags; -	unsigned int dest; -	unsigned int irq; - -	irq = desc->irq; -	cfg = desc->chip_data; - -	spin_lock_irqsave(&ioapic_lock, flags); -	dest = set_desc_affinity(desc, mask); -	if (dest != BAD_APICID) { -		/* Only the high 8 bits are valid. */ -		dest = SET_APIC_LOGICAL_ID(dest); -		__target_IO_APIC_irq(irq, dest, cfg); -	} -	spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ -	struct irq_desc *desc; - -	desc = irq_to_desc(irq); - -	set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ -  /*   * The common case is 1:1 IRQ<->pin mappings. Sometimes there are   * shared ISA-space IRQs, so we have to support them. We are super   * fast in the common case, and fast for shared ISA-space IRQs.   */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)  {  	struct irq_pin_list *entry;  	entry = cfg->irq_2_pin;  	if (!entry) { -		entry = get_one_free_irq_2_pin(cpu); +		entry = get_one_free_irq_2_pin(node);  		if (!entry) {  			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",  					apic, pin); @@ -663,7 +524,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)  		entry = entry->next;  	} -	entry->next = get_one_free_irq_2_pin(cpu); +	entry->next = get_one_free_irq_2_pin(node);  	entry = entry->next;  	entry->apic = apic;  	entry->pin = pin; @@ -672,7 +533,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)  /*   * Reroute an IRQ to a different pin.   */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,  				      int oldapic, int oldpin,  				      int newapic, int newpin)  { @@ -692,7 +553,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,  	/* why? call replace before add? 
*/  	if (!replaced) -		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); +		add_pin_to_irq_node(cfg, node, newapic, newpin);  }  static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -850,7 +711,6 @@ static int __init ioapic_pirq_setup(char *str)  __setup("pirq=", ioapic_pirq_setup);  #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP  struct IO_APIC_route_entry **alloc_ioapic_entries(void)  {  	int apic; @@ -948,20 +808,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)  	return 0;  } -void reinit_intr_remapped_IO_APIC(int intr_remapping, -	struct IO_APIC_route_entry **ioapic_entries) - -{ -	/* -	 * for now plain restore of previous settings. -	 * TBD: In the case of OS enabling interrupt-remapping, -	 * IO-APIC RTE's need to be setup to point to interrupt-remapping -	 * table entries. for now, do a plain restore, and wait for -	 * the setup_IO_APIC_irqs() to do proper initialization. -	 */ -	restore_IO_APIC_setup(ioapic_entries); -} -  void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)  {  	int apic; @@ -971,7 +817,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)  	kfree(ioapic_entries);  } -#endif  /*   * Find the IRQ entry number of a certain pin. @@ -1032,54 +877,6 @@ static int __init find_isa_irq_apic(int irq, int type)  	return -1;  } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ -	int apic, i, best_guess = -1; - -	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", -		bus, slot, pin); -	if (test_bit(bus, mp_bus_not_pci)) { -		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); -		return -1; -	} -	for (i = 0; i < mp_irq_entries; i++) { -		int lbus = mp_irqs[i].srcbus; - -		for (apic = 0; apic < nr_ioapics; apic++) -			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || -			    mp_irqs[i].dstapic == MP_APIC_ALL) -				break; - -		if (!test_bit(lbus, mp_bus_not_pci) && -		    !mp_irqs[i].irqtype && -		    (bus == lbus) && -		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { -			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - -			if (!(apic || IO_APIC_IRQ(irq))) -				continue; - -			if (pin == (mp_irqs[i].srcbusirq & 3)) -				return irq; -			/* -			 * Use the first all-but-pin matching entry as a -			 * best-guess fuzzy result for broken mptables. -			 */ -			if (best_guess < 0) -				best_guess = irq; -		} -	} -	return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -  #if defined(CONFIG_EISA) || defined(CONFIG_MCA)  /*   * EISA Edge/Level control register, ELCR @@ -1298,6 +1095,64 @@ static int pin_2_irq(int idx, int apic, int pin)  	return irq;  } +/* + * Find a specific PCI IRQ entry. 
+ * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, +				struct io_apic_irq_attr *irq_attr) +{ +	int apic, i, best_guess = -1; + +	apic_printk(APIC_DEBUG, +		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", +		    bus, slot, pin); +	if (test_bit(bus, mp_bus_not_pci)) { +		apic_printk(APIC_VERBOSE, +			    "PCI BIOS passed nonexistent PCI bus %d!\n", bus); +		return -1; +	} +	for (i = 0; i < mp_irq_entries; i++) { +		int lbus = mp_irqs[i].srcbus; + +		for (apic = 0; apic < nr_ioapics; apic++) +			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || +			    mp_irqs[i].dstapic == MP_APIC_ALL) +				break; + +		if (!test_bit(lbus, mp_bus_not_pci) && +		    !mp_irqs[i].irqtype && +		    (bus == lbus) && +		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { +			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + +			if (!(apic || IO_APIC_IRQ(irq))) +				continue; + +			if (pin == (mp_irqs[i].srcbusirq & 3)) { +				set_io_apic_irq_attr(irq_attr, apic, +						     mp_irqs[i].dstirq, +						     irq_trigger(i), +						     irq_polarity(i)); +				return irq; +			} +			/* +			 * Use the first all-but-pin matching entry as a +			 * best-guess fuzzy result for broken mptables. +			 */ +			if (best_guess < 0) { +				set_io_apic_irq_attr(irq_attr, apic, +						     mp_irqs[i].dstirq, +						     irq_trigger(i), +						     irq_polarity(i)); +				best_guess = irq; +			} +		} +	} +	return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); +  void lock_vector_lock(void)  {  	/* Used to the online set of cpus does not change @@ -1628,58 +1483,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq  	ioapic_write_entry(apic_id, pin, entry);  } +static struct { +	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; +  static void __init setup_IO_APIC_irqs(void)  { -	int apic_id, pin, idx, irq; +	int apic_id = 0, pin, idx, irq;  	int notcon = 0;  	struct irq_desc *desc;  	struct irq_cfg *cfg; -	int cpu = boot_cpu_id; +	int node = cpu_to_node(boot_cpu_id);  	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { -		for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { +#ifdef CONFIG_ACPI +	if (!acpi_disabled && acpi_ioapic) { +		apic_id = mp_find_ioapic(0); +		if (apic_id < 0) +			apic_id = 0; +	} +#endif -			idx = find_irq_entry(apic_id, pin, mp_INT); -			if (idx == -1) { -				if (!notcon) { -					notcon = 1; -					apic_printk(APIC_VERBOSE, -						KERN_DEBUG " %d-%d", -						mp_ioapics[apic_id].apicid, pin); -				} else -					apic_printk(APIC_VERBOSE, " %d-%d", -						mp_ioapics[apic_id].apicid, pin); -				continue; -			} -			if (notcon) { +	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { +		idx = find_irq_entry(apic_id, pin, mp_INT); +		if (idx == -1) { +			if (!notcon) { +				notcon = 1;  				apic_printk(APIC_VERBOSE, -					" (apicid-pin) not connected\n"); -				notcon = 0; -			} +					KERN_DEBUG " %d-%d", +					mp_ioapics[apic_id].apicid, pin); +			} else +				apic_printk(APIC_VERBOSE, " %d-%d", +					mp_ioapics[apic_id].apicid, pin); +			continue; +		} +		if (notcon) { +			apic_printk(APIC_VERBOSE, +				" (apicid-pin) not connected\n"); +			notcon = 0; +		} -			irq = pin_2_irq(idx, apic_id, pin); +		irq = pin_2_irq(idx, apic_id, pin); -			/* -			 * Skip the timer IRQ if there's a quirk handler -			 * installed and if it returns 1: -			 */ -			if (apic->multi_timer_check && -					
apic->multi_timer_check(apic_id, irq)) -				continue; - -			desc = irq_to_desc_alloc_cpu(irq, cpu); -			if (!desc) { -				printk(KERN_INFO "can not get irq_desc for %d\n", irq); -				continue; -			} -			cfg = desc->chip_data; -			add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); +		/* +		 * Skip the timer IRQ if there's a quirk handler +		 * installed and if it returns 1: +		 */ +		if (apic->multi_timer_check && +				apic->multi_timer_check(apic_id, irq)) +			continue; -			setup_IO_APIC_irq(apic_id, pin, irq, desc, -					irq_trigger(idx), irq_polarity(idx)); +		desc = irq_to_desc_alloc_node(irq, node); +		if (!desc) { +			printk(KERN_INFO "can not get irq_desc for %d\n", irq); +			continue;  		} +		cfg = desc->chip_data; +		add_pin_to_irq_node(cfg, node, apic_id, pin); +		/* +		 * don't mark it in pin_programmed, so later acpi could +		 * set it correctly when irq < 16 +		 */ +		setup_IO_APIC_irq(apic_id, pin, irq, desc, +				irq_trigger(idx), irq_polarity(idx));  	}  	if (notcon) @@ -1869,7 +1736,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)  __apicdebuginit(void) print_local_APIC(void *dummy)  { -	unsigned int v, ver, maxlvt; +	unsigned int i, v, ver, maxlvt;  	u64 icr;  	if (apic_verbosity == APIC_QUIET) @@ -1957,6 +1824,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)  	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);  	v = apic_read(APIC_TDCR);  	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + +	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { +		v = apic_read(APIC_EFEAT); +		maxlvt = (v >> 16) & 0xff; +		printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); +		v = apic_read(APIC_ECTRL); +		printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); +		for (i = 0; i < maxlvt; i++) { +			v = apic_read(APIC_EILVTn(i)); +			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); +		} +	}  	printk("\n");  } @@ -2005,6 +1884,11 @@ __apicdebuginit(void) print_PIC(void)  __apicdebuginit(int) print_all_ICs(void)  {  	print_PIC(); + +	/* don't print out if apic is not there */ +	if (!cpu_has_apic || disable_apic) +		return 0; +  	print_all_local_APICs();  	print_IO_APIC(); @@ -2120,7 +2004,9 @@ void disable_IO_APIC(void)  	/*  	 * Use virtual wire A mode when interrupt remapping is enabled.  	 
*/ -	disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); +	if (cpu_has_apic) +		disconnect_bsp_APIC(!intr_remapping_enabled && +				ioapic_i8259.pin != -1);  }  #ifdef CONFIG_X86_32 @@ -2360,6 +2246,118 @@ static int ioapic_retrigger_irq(unsigned int irq)   */  #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ +	cpumask_var_t cleanup_mask; + +	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { +		unsigned int i; +		cfg->move_cleanup_count = 0; +		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) +			cfg->move_cleanup_count++; +		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) +			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); +	} else { +		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); +		cfg->move_cleanup_count = cpumask_weight(cleanup_mask); +		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); +		free_cpumask_var(cleanup_mask); +	} +	cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ +	int apic, pin; +	struct irq_pin_list *entry; +	u8 vector = cfg->vector; + +	entry = cfg->irq_2_pin; +	for (;;) { +		unsigned int reg; + +		if (!entry) +			break; + +		apic = entry->apic; +		pin = entry->pin; +		/* +		 * With interrupt-remapping, destination information comes +		 * from interrupt-remapping table entry. +		 */ +		if (!irq_remapped(irq)) +			io_apic_write(apic, 0x11 + pin*2, dest); +		reg = io_apic_read(apic, 0x10 + pin*2); +		reg &= ~IO_APIC_REDIR_VECTOR_MASK; +		reg |= vector; +		io_apic_modify(apic, 0x10 + pin*2, reg); +		if (!entry->next) +			break; +		entry = entry->next; +	} +} + +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ +	struct irq_cfg *cfg; +	unsigned int irq; + +	if (!cpumask_intersects(mask, cpu_online_mask)) +		return BAD_APICID; + +	irq = desc->irq; +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, mask)) +		return BAD_APICID; + +	cpumask_copy(desc->affinity, mask); + +	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static int +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ +	struct irq_cfg *cfg; +	unsigned long flags; +	unsigned int dest; +	unsigned int irq; +	int ret = -1; + +	irq = desc->irq; +	cfg = desc->chip_data; + +	spin_lock_irqsave(&ioapic_lock, flags); +	dest = set_desc_affinity(desc, mask); +	if (dest != BAD_APICID) { +		/* Only the high 8 bits are valid. */ +		dest = SET_APIC_LOGICAL_ID(dest); +		__target_IO_APIC_irq(irq, dest, cfg); +		ret = 0; +	} +	spin_unlock_irqrestore(&ioapic_lock, flags); + +	return ret; +} + +static int +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ +	struct irq_desc *desc; + +	desc = irq_to_desc(irq); + +	return set_ioapic_affinity_irq_desc(desc, mask); +}  #ifdef CONFIG_INTR_REMAP @@ -2374,26 +2372,25 @@ static int ioapic_retrigger_irq(unsigned int irq)   * Real vector that is used for interrupting cpu will be coming from   * the interrupt-remapping table entry.   
*/ -static void +static int  migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)  {  	struct irq_cfg *cfg;  	struct irte irte;  	unsigned int dest;  	unsigned int irq; +	int ret = -1;  	if (!cpumask_intersects(mask, cpu_online_mask)) -		return; +		return ret;  	irq = desc->irq;  	if (get_irte(irq, &irte)) -		return; +		return ret;  	cfg = desc->chip_data;  	if (assign_irq_vector(irq, cfg, mask)) -		return; - -	set_extra_move_desc(desc, mask); +		return ret;  	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2409,27 +2406,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)  		send_cleanup_vector(cfg);  	cpumask_copy(desc->affinity, mask); + +	return 0;  }  /*   * Migrates the IRQ destination in the process context.   */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,  					    const struct cpumask *mask)  { -	migrate_ioapic_irq_desc(desc, mask); +	return migrate_ioapic_irq_desc(desc, mask);  } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq,  				       const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq); -	set_ir_ioapic_affinity_irq_desc(desc, mask); +	return set_ir_ioapic_affinity_irq_desc(desc, mask);  }  #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,  						   const struct cpumask *mask)  { +	return 0;  }  #endif @@ -2491,86 +2491,19 @@ static void irq_complete_move(struct irq_desc **descp)  	struct irq_cfg *cfg = desc->chip_data;  	unsigned vector, me; -	if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -		if (likely(!cfg->move_desc_pending)) -			return; - -		/* domain has not changed, but affinity did */ -		me = smp_processor_id(); -		if (cpumask_test_cpu(me, desc->affinity)) { -			*descp = desc = move_irq_desc(desc, me); -			/* get the new one */ -			cfg = desc->chip_data; -			cfg->move_desc_pending = 0; -		} -#endif +	if (likely(!cfg->move_in_progress))  		return; -	}  	vector = ~get_irq_regs()->orig_ax;  	me = smp_processor_id(); -	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -		*descp = desc = move_irq_desc(desc, me); -		/* get the new one */ -		cfg = desc->chip_data; -#endif +	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))  		send_cleanup_vector(cfg); -	}  }  #else  static inline void irq_complete_move(struct irq_desc **descp) {}  #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ -	int apic, pin; -	struct irq_pin_list *entry; - -	entry = cfg->irq_2_pin; -	for (;;) { - -		if (!entry) -			break; - -		apic = entry->apic; -		pin = entry->pin; -		io_apic_eoi(apic, pin); -		entry = entry->next; -	} -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ -	struct irq_cfg *cfg; -	unsigned long flags; -	unsigned int irq; - -	irq = desc->irq; -	cfg = desc->chip_data; - -	spin_lock_irqsave(&ioapic_lock, flags); -	__eoi_ioapic_irq(irq, cfg); -	spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ -	struct irq_desc *desc = irq_to_desc(irq); -	ack_x2APIC_irq(); -	eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ -	ack_x2APIC_irq(); -} -#endif -  static void ack_apic_edge(unsigned int irq)  {  	struct irq_desc *desc = 
irq_to_desc(irq); @@ -2634,9 +2567,6 @@ static void ack_apic_level(unsigned int irq)  	 */  	ack_APIC_irq(); -	if (irq_remapped(irq)) -		eoi_ioapic_irq(desc); -  	/* Now we can move and renable the irq */  	if (unlikely(do_unmask_irq)) {  		/* Only migrate the irq if the ack has been received. @@ -2683,22 +2613,50 @@ static void ack_apic_level(unsigned int irq)  }  #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ +	int apic, pin; +	struct irq_pin_list *entry; + +	entry = cfg->irq_2_pin; +	for (;;) { + +		if (!entry) +			break; + +		apic = entry->apic; +		pin = entry->pin; +		io_apic_eoi(apic, pin); +		entry = entry->next; +	} +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ +	struct irq_cfg *cfg; +	unsigned long flags; +	unsigned int irq; + +	irq = desc->irq; +	cfg = desc->chip_data; + +	spin_lock_irqsave(&ioapic_lock, flags); +	__eoi_ioapic_irq(irq, cfg); +	spin_unlock_irqrestore(&ioapic_lock, flags); +} +  static void ir_ack_apic_edge(unsigned int irq)  { -#ifdef CONFIG_X86_X2APIC -       if (x2apic_enabled()) -               return ack_x2apic_edge(irq); -#endif -       return ack_apic_edge(irq); +	ack_APIC_irq();  }  static void ir_ack_apic_level(unsigned int irq)  { -#ifdef CONFIG_X86_X2APIC -       if (x2apic_enabled()) -               return ack_x2apic_level(irq); -#endif -       return ack_apic_level(irq); +	struct irq_desc *desc = irq_to_desc(irq); + +	ack_APIC_irq(); +	eoi_ioapic_irq(desc);  }  #endif /* CONFIG_INTR_REMAP */ @@ -2903,7 +2861,7 @@ static inline void __init check_timer(void)  {  	struct irq_desc *desc = irq_to_desc(0);  	struct irq_cfg *cfg = desc->chip_data; -	int cpu = boot_cpu_id; +	int node = cpu_to_node(boot_cpu_id);  	int apic1, pin1, apic2, pin2;  	unsigned long flags;  	int no_pin1 = 0; @@ -2969,7 +2927,7 @@ static inline void __init check_timer(void)  		 * Ok, does IRQ0 through the IOAPIC work?  		 
*/  		if (no_pin1) { -			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); +			add_pin_to_irq_node(cfg, node, apic1, pin1);  			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);  		} else {  			/* for edge trigger, setup_IO_APIC_irq already @@ -3006,7 +2964,7 @@ static inline void __init check_timer(void)  		/*  		 * legacy devices should be connected to IO APIC #0  		 */ -		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); +		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);  		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);  		enable_8259A_irq(0);  		if (timer_irq_works()) { @@ -3218,14 +3176,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;  /*   * Dynamic irq allocate and deallocation   */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node)  {  	/* Allocate an unused irq */  	unsigned int irq;  	unsigned int new;  	unsigned long flags;  	struct irq_cfg *cfg_new = NULL; -	int cpu = boot_cpu_id;  	struct irq_desc *desc_new = NULL;  	irq = 0; @@ -3234,7 +3191,7 @@ unsigned int create_irq_nr(unsigned int irq_want)  	spin_lock_irqsave(&vector_lock, flags);  	for (new = irq_want; new < nr_irqs; new++) { -		desc_new = irq_to_desc_alloc_cpu(new, cpu); +		desc_new = irq_to_desc_alloc_node(new, node);  		if (!desc_new) {  			printk(KERN_INFO "can not get irq_desc for %d\n", new);  			continue; @@ -3243,6 +3200,9 @@ unsigned int create_irq_nr(unsigned int irq_want)  		if (cfg_new->vector != 0)  			continue; + +		desc_new = move_irq_desc(desc_new, node); +  		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)  			irq = new;  		break; @@ -3260,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want)  int create_irq(void)  { +	int node = cpu_to_node(boot_cpu_id);  	unsigned int irq_want;  	int irq;  	irq_want = nr_irqs_gsi; -	irq = create_irq_nr(irq_want); +	irq = create_irq_nr(irq_want, node);  	if (irq == 0)  		irq = -1; @@ -3366,7 +3327,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  }  #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg; @@ -3375,7 +3336,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  	dest = set_desc_affinity(desc, mask);  	if (dest == BAD_APICID) -		return; +		return -1;  	cfg = desc->chip_data; @@ -3387,13 +3348,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);  	write_msi_msg_desc(desc, &msg); + +	return 0;  }  #ifdef CONFIG_INTR_REMAP  /*   * Migrate the MSI irq to another cpumask. This migration is   * done in the process context using interrupt-remapping hardware.   
*/ -static void +static int  ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq); @@ -3402,11 +3365,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  	struct irte irte;  	if (get_irte(irq, &irte)) -		return; +		return -1;  	dest = set_desc_affinity(desc, mask);  	if (dest == BAD_APICID) -		return; +		return -1;  	irte.vector = cfg->vector;  	irte.dest_id = IRTE_DEST(dest); @@ -3423,6 +3386,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  	 */  	if (cfg->move_in_progress)  		send_cleanup_vector(cfg); + +	return 0;  }  #endif @@ -3518,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  	unsigned int irq_want;  	struct intel_iommu *iommu = NULL;  	int index = 0; +	int node;  	/* x86 doesn't support multiple MSI yet */  	if (type == PCI_CAP_ID_MSI && nvec > 1)  		return 1; +	node = dev_to_node(&dev->dev);  	irq_want = nr_irqs_gsi;  	sub_handle = 0;  	list_for_each_entry(msidesc, &dev->msi_list, list) { -		irq = create_irq_nr(irq_want); +		irq = create_irq_nr(irq_want, node);  		if (irq == 0)  			return -1;  		irq_want = irq + 1; @@ -3576,7 +3543,7 @@ void arch_teardown_msi_irq(unsigned int irq)  #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)  #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg; @@ -3585,7 +3552,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	dest = set_desc_affinity(desc, mask);  	if (dest == BAD_APICID) -		return; +		return -1;  	cfg = desc->chip_data; @@ -3597,11 +3564,13 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);  	dmar_msi_write(irq, &msg); + +	return 0;  }  #endif /* CONFIG_SMP */ -struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_type = {  	.name = "DMAR_MSI",  	.unmask = dmar_msi_unmask,  	.mask = dmar_msi_mask, @@ -3630,7 +3599,7 @@ int arch_setup_dmar_msi(unsigned int irq)  #ifdef CONFIG_HPET_TIMER  #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg; @@ -3639,7 +3608,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	dest = set_desc_affinity(desc, mask);  	if (dest == BAD_APICID) -		return; +		return -1;  	cfg = desc->chip_data; @@ -3651,6 +3620,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);  	hpet_msi_write(irq, &msg); + +	return 0;  }  #endif /* CONFIG_SMP */ @@ -3707,7 +3678,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)  	write_ht_irq_msg(irq, &msg);  } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg; @@ -3715,11 +3686,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)  	dest = set_desc_affinity(desc, mask);  	if (dest == BAD_APICID) -		return; +		return -1;  	cfg = desc->chip_data;  	target_ht_irq(irq, dest, cfg->vector); + +	return 
0;  }  #endif @@ -3794,6 +3767,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	unsigned long flags;  	int err; +	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); +  	cfg = irq_cfg(irq);  	err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3807,15 +3782,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	mmr_value = 0;  	entry = (struct uv_IO_APIC_route_entry *)&mmr_value; -	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - -	entry->vector = cfg->vector; -	entry->delivery_mode = apic->irq_delivery_mode; -	entry->dest_mode = apic->irq_dest_mode; -	entry->polarity = 0; -	entry->trigger = 0; -	entry->mask = 0; -	entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); +	entry->vector		= cfg->vector; +	entry->delivery_mode	= apic->irq_delivery_mode; +	entry->dest_mode	= apic->irq_dest_mode; +	entry->polarity		= 0; +	entry->trigger		= 0; +	entry->mask		= 0; +	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu);  	mmr_pnode = uv_blade_to_pnode(mmr_blade);  	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3833,10 +3806,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)  	struct uv_IO_APIC_route_entry *entry;  	int mmr_pnode; +	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); +  	mmr_value = 0;  	entry = (struct uv_IO_APIC_route_entry *)&mmr_value; -	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); -  	entry->mask = 1;  	mmr_pnode = uv_blade_to_pnode(mmr_blade); @@ -3900,6 +3873,71 @@ int __init arch_probe_nr_irqs(void)  }  #endif +static int __io_apic_set_pci_routing(struct device *dev, int irq, +				struct io_apic_irq_attr *irq_attr) +{ +	struct irq_desc *desc; +	struct irq_cfg *cfg; +	int node; +	int ioapic, pin; +	int trigger, polarity; + +	ioapic = irq_attr->ioapic; +	if (!IO_APIC_IRQ(irq)) { +		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", +			ioapic); +		return -EINVAL; +	} + +	if (dev) +		node = dev_to_node(dev); +	else +		node = cpu_to_node(boot_cpu_id); + +	desc = irq_to_desc_alloc_node(irq, node); +	if (!desc) { +		printk(KERN_INFO "can not get irq_desc %d\n", irq); +		return 0; +	} + +	pin = irq_attr->ioapic_pin; +	trigger = irq_attr->trigger; +	polarity = irq_attr->polarity; + +	/* +	 * IRQs < 16 are already in the irq_2_pin[] map +	 */ +	if (irq >= NR_IRQS_LEGACY) { +		cfg = desc->chip_data; +		add_pin_to_irq_node(cfg, node, ioapic, pin); +	} + +	setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + +	return 0; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, +				struct io_apic_irq_attr *irq_attr) +{ +	int ioapic, pin; +	/* +	 * Avoid pin reprogramming.  PRTs typically include entries +	 * with redundant pin->gsi mappings (but unique PCI devices); +	 * we only program the IOAPIC on the first. 
+	 */ +	ioapic = irq_attr->ioapic; +	pin = irq_attr->ioapic_pin; +	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { +		pr_debug("Pin %d-%d already programmed\n", +			 mp_ioapics[ioapic].apicid, pin); +		return 0; +	} +	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + +	return __io_apic_set_pci_routing(dev, irq, irq_attr); +} +  /* --------------------------------------------------------------------------                            ACPI-based IOAPIC Configuration     -------------------------------------------------------------------------- */ @@ -3980,6 +4018,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)  	return apic_id;  } +#endif  int __init io_apic_get_version(int ioapic)  { @@ -3992,39 +4031,6 @@ int __init io_apic_get_version(int ioapic)  	return reg_01.bits.version;  } -#endif - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ -	struct irq_desc *desc; -	struct irq_cfg *cfg; -	int cpu = boot_cpu_id; - -	if (!IO_APIC_IRQ(irq)) { -		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", -			ioapic); -		return -EINVAL; -	} - -	desc = irq_to_desc_alloc_cpu(irq, cpu); -	if (!desc) { -		printk(KERN_INFO "can not get irq_desc %d\n", irq); -		return 0; -	} - -	/* -	 * IRQs < 16 are already in the irq_2_pin[] map -	 */ -	if (irq >= NR_IRQS_LEGACY) { -		cfg = desc->chip_data; -		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); -	} - -	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - -	return 0; -} -  int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)  { @@ -4055,51 +4061,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)  #ifdef CONFIG_SMP  void __init setup_ioapic_dest(void)  { -	int pin, ioapic, irq, irq_entry; +	int pin, ioapic = 0, irq, irq_entry;  	struct irq_desc *desc; -	struct irq_cfg *cfg;  	const struct cpumask *mask;  	if (skip_ioapic_setup == 1)  		return; -	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { -		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { -			irq_entry = find_irq_entry(ioapic, pin, mp_INT); -			if (irq_entry == -1) -				continue; -			irq = pin_2_irq(irq_entry, ioapic, pin); - -			/* setup_IO_APIC_irqs could fail to get vector for some device -			 * when you have too many devices, because at that time only boot -			 * cpu is online. 
-			 */ -			desc = irq_to_desc(irq); -			cfg = desc->chip_data; -			if (!cfg->vector) { -				setup_IO_APIC_irq(ioapic, pin, irq, desc, -						  irq_trigger(irq_entry), -						  irq_polarity(irq_entry)); -				continue; +#ifdef CONFIG_ACPI +	if (!acpi_disabled && acpi_ioapic) { +		ioapic = mp_find_ioapic(0); +		if (ioapic < 0) +			ioapic = 0; +	} +#endif -			} +	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { +		irq_entry = find_irq_entry(ioapic, pin, mp_INT); +		if (irq_entry == -1) +			continue; +		irq = pin_2_irq(irq_entry, ioapic, pin); -			/* -			 * Honour affinities which have been set in early boot -			 */ -			if (desc->status & -			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) -				mask = desc->affinity; -			else -				mask = apic->target_cpus(); +		desc = irq_to_desc(irq); -			if (intr_remapping_enabled) -				set_ir_ioapic_affinity_irq_desc(desc, mask); -			else -				set_ioapic_affinity_irq_desc(desc, mask); -		} +		/* +		 * Honour affinities which have been set in early boot +		 */ +		if (desc->status & +		    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) +			mask = desc->affinity; +		else +			mask = apic->target_cpus(); +		if (intr_remapping_enabled) +			set_ir_ioapic_affinity_irq_desc(desc, mask); +		else +			set_ioapic_affinity_irq_desc(desc, mask);  	} +  }  #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a..b3025b43b63 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)  static inline int mce_in_progress(void)  { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_NEW_MCE)  	return atomic_read(&mce_entry) > 0;  #endif  	return 0; @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)  }  #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)  {  	printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e..0c0182cc947 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -20,23 +20,12 @@  #include <asm/apic.h>  #include <asm/setup.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <linux/kernel.h> -#include <linux/string.h>  #include <linux/smp.h> -#include <linux/init.h>  #include <asm/ipi.h> -#include <linux/smp.h> -#include <linux/init.h>  #include <linux/interrupt.h>  #include <asm/acpi.h>  #include <asm/e820.h> -#include <asm/setup.h>  #ifdef CONFIG_HOTPLUG_CPU  #define DEFAULT_SEND_IPI	(1) @@ -160,7 +149,6 @@ extern struct apic apic_summit;  extern struct apic apic_bigsmp;  extern struct apic apic_es7000;  extern struct apic apic_es7000_cluster; -extern struct apic apic_default;  struct apic *apic = &apic_default;  EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e..bc3e880f9b8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {  void __init default_setup_apic_routing(void)  {  #ifdef CONFIG_X86_X2APIC -	if (x2apic && (apic != &apic_x2apic_phys && +	if (x2apic_mode && (apic != &apic_x2apic_phys &&  #ifdef CONFIG_X86_UV  		       apic != &apic_x2apic_uv_x &&  #endif diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d8..eafdfbd1ea9 
100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -44,7 +44,6 @@  #include <asm/ipi.h>  #include <linux/kernel.h>  #include <linux/string.h> -#include <linux/init.h>  #include <linux/gfp.h>  #include <linux/smp.h> @@ -173,13 +172,6 @@ static inline int is_WPEG(struct rio_detail *rio){  		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);  } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT	4 -#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) -  #define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER)  static const struct cpumask *summit_target_cpus(void) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d1..8e4cbb255c3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@  #include <asm/apic.h>  #include <asm/ipi.h> -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);  static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda6935297..096d19aea2f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)  	cpumask_set_cpu(cpu, retmask);  } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)  {  #ifdef CONFIG_SMP  	unsigned long val; @@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)  	uv_set_scir_bits(bits);  	/* enable next timer period */ -	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +	mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);  }  static void __cpuinit uv_heartbeat_enable(int cpu) @@ -562,7 +562,7 @@ void __init uv_system_init(void)  	union uvh_node_id_u node_id;  	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;  	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; -	int max_pnode = 0; +	int gnode_extra, max_pnode = 0;  	unsigned long mmr_base, present, paddr;  	unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void)  	mmr_base =  	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &  	    ~UV_MMR_ENABLE; +	pnode_mask = (1 << n_val) - 1; +	node_id.v = uv_read_local_mmr(UVH_NODE_ID); +	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; +	gnode_upper = ((unsigned long)gnode_extra  << m_val); +	printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", +			n_val, m_val, gnode_upper, gnode_extra); +  	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);  	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -583,15 +590,18 @@ void __init uv_system_init(void)  	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();  	uv_blade_info = kmalloc(bytes, GFP_KERNEL); +	BUG_ON(!uv_blade_info);  	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);  	bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();  	uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); +	BUG_ON(!uv_node_to_blade);  	memset(uv_node_to_blade, 255, bytes);  	bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();  	uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); +	
BUG_ON(!uv_cpu_to_blade);  	memset(uv_cpu_to_blade, 255, bytes);  	blade = 0; @@ -607,11 +617,6 @@ void __init uv_system_init(void)  		}  	} -	pnode_mask = (1 << n_val) - 1; -	node_id.v = uv_read_local_mmr(UVH_NODE_ID); -	gnode_upper = (((unsigned long)node_id.s.node_id) & -		       ~((1 << n_val) - 1)) << m_val; -  	uv_bios_init();  	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,  			    &sn_coherency_id, &sn_region_size); @@ -634,6 +639,7 @@ void __init uv_system_init(void)  		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;  		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;  		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; +		uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;  		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;  		uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;  		uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 49e0939bac4..79302e9a33a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1233,9 +1233,9 @@ static int suspend(int vetoable)  	int err;  	struct apm_user	*as; -	device_suspend(PMSG_SUSPEND); +	dpm_suspend_start(PMSG_SUSPEND); -	device_power_down(PMSG_SUSPEND); +	dpm_suspend_noirq(PMSG_SUSPEND);  	local_irq_disable();  	sysdev_suspend(PMSG_SUSPEND); @@ -1259,9 +1259,9 @@ static int suspend(int vetoable)  	sysdev_resume();  	local_irq_enable(); -	device_power_up(PMSG_RESUME); +	dpm_resume_noirq(PMSG_RESUME); -	device_resume(PMSG_RESUME); +	dpm_resume_end(PMSG_RESUME);  	queue_event(APM_NORMAL_RESUME, NULL);  	spin_lock(&user_list_lock);  	for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1277,7 @@ static void standby(void)  {  	int err; -	device_power_down(PMSG_SUSPEND); +	dpm_suspend_noirq(PMSG_SUSPEND);  	local_irq_disable();  	sysdev_suspend(PMSG_SUSPEND); @@ -1291,7 +1291,7 @@ static void standby(void)  	sysdev_resume();  	local_irq_enable(); -	device_power_up(PMSG_RESUME); +	dpm_resume_noirq(PMSG_RESUME);  }  static apm_event_t get_event(void) @@ -1376,7 +1376,7 @@ static void check_events(void)  			ignore_bounce = 1;  			if ((event != APM_NORMAL_RESUME)  			    || (ignore_normal_resume == 0)) { -				device_resume(PMSG_RESUME); +				dpm_resume_end(PMSG_RESUME);  				queue_event(event, NULL);  			}  			ignore_normal_resume = 0; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162..dfdbf640389 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void)  #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)  	BLANK();  	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); +	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);  	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);  	BLANK(); @@ -146,4 +147,5 @@ void foo(void)  	OFFSET(BP_loadflags, boot_params, hdr.loadflags);  	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);  	OFFSET(BP_version, boot_params, hdr.version); +	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);  } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b..898ecc47e12 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void)  	OFFSET(BP_loadflags, boot_params, hdr.loadflags);  	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);  	OFFSET(BP_version, boot_params, hdr.version); +	
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);  	BLANK();  	DEFINE(PAGE_SIZE_asm, PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e..3efcb2b96a1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@  # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks  #  # Don't trace early stages of a secondary CPU boot @@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o  obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o -obj-$(CONFIG_X86_MCE)	+= mcheck/ -obj-$(CONFIG_MTRR)	+= mtrr/ -obj-$(CONFIG_CPU_FREQ)	+= cpufreq/ +obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE)			+= mcheck/ +obj-$(CONFIG_MTRR)			+= mtrr/ +obj-$(CONFIG_CPU_FREQ)			+= cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o  quiet_cmd_mkcapflags = MKCAP   $@        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa6..e5b27d8f1b4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@  #include <asm/processor.h>  #include <asm/apic.h>  #include <asm/cpu.h> +#include <asm/pci-direct.h>  #ifdef CONFIG_X86_64  # include <asm/numa_64.h> @@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	int cpu = smp_processor_id();  	int node; -	unsigned apicid = hard_smp_processor_id(); +	unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;  	node = c->phys_proc_id;  	if (apicid_to_node[apicid] != NUMA_NO_NODE) @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)  		    (c->x86_model == 8 && c->x86_mask >= 8))  			set_cpu_cap(c, X86_FEATURE_K6_MTRR);  #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) +	/* check CPU config space for extended APIC ID */ +	if (c->x86 >= 0xf) { +		unsigned int val; +		val = read_pci_config(0, 24, 0, 0x68); +		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) +			set_cpu_cap(c, X86_FEATURE_EXTD_APICID); +	} +#endif  }  static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77848d9fca6..6b26d4deada 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@  #include <linux/io.h>  #include <asm/stackprotector.h> +#include <asm/perf_counter.h>  #include <asm/mmu_context.h>  #include <asm/hypervisor.h>  #include <asm/processor.h> @@ -107,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {  	/* data */  	[GDT_ENTRY_APMBIOS_BASE+2]	= { { { 0x0000ffff, 0x00409200 } } }, -	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x00000000, 0x00c09200 } } }, +	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x0000ffff, 0x00cf9200 } } },  	[GDT_ENTRY_PERCPU]		= { { { 0x0000ffff, 0x00cf9200 } } },  	GDT_STACK_CANARY_INIT  #endif @@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)  	return NULL;		/* Not found */  } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;  void load_percpu_segment(int cpu)  { @@ -485,7 +487,6 @@ out:  static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)  {  	char *v = c->x86_vendor_id; -	
static int printed;  	int i;  	for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -502,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)  		}  	} -	if (!printed) { -		printed++; -		printk(KERN_ERR -		    "CPU: vendor_id '%s' unknown, using generic init.\n", v); - -		printk(KERN_ERR "CPU: Your system may be unstable.\n"); -	} +	printk_once(KERN_ERR +			"CPU: vendor_id '%s' unknown, using generic init.\n" \ +			"CPU: Your system may be unstable.\n", v);  	c->x86_vendor = X86_VENDOR_UNKNOWN;  	this_cpu = &default_cpu; @@ -768,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  	if (this_cpu->c_identify)  		this_cpu->c_identify(c); +	/* Clear/Set all flags overriden by options, after probe */ +	for (i = 0; i < NCAPINTS; i++) { +		c->x86_capability[i] &= ~cpu_caps_cleared[i]; +		c->x86_capability[i] |= cpu_caps_set[i]; +	} +  #ifdef CONFIG_X86_64  	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);  #endif @@ -813,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  #endif  	init_hypervisor(c); + +	/* +	 * Clear/Set all flags overriden by options, need do it +	 * before following smp all cpus cap AND. +	 */ +	for (i = 0; i < NCAPINTS; i++) { +		c->x86_capability[i] &= ~cpu_caps_cleared[i]; +		c->x86_capability[i] |= cpu_caps_set[i]; +	} +  	/*  	 * On SMP, boot_cpu_data holds the common feature set between  	 * all CPUs; so make sure that we indicate which features are @@ -825,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];  	} -	/* Clear all flags overriden by options */ -	for (i = 0; i < NCAPINTS; i++) -		c->x86_capability[i] &= ~cleared_cpu_caps[i]; -  #ifdef CONFIG_X86_MCE  	/* Init Machine Check Exception if available. 
*/  	mcheck_init(c); @@ -839,6 +848,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	numa_add_cpu(smp_processor_id());  #endif + +	/* Cap the iomem address space to what is addressable on all CPUs */ +	iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1;  }  #ifdef CONFIG_X86_64 @@ -861,6 +873,7 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif +	init_hw_perf_counters();  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6..6b2a52dd040 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@  static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);  static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag);  static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model);  static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {  	{ "value",	CPU_REG_ALL,	1	},  }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { -	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		}, -	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		}, -	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		}, -	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	}, -	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		}, -	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		}, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { +	{ 0x00000000, 0x00000001, CPU_MC,	}, +	{ 0x00000006, 0x00000007, CPU_MONITOR,	}, +	{ 0x00000010, 0x00000010, CPU_TIME,	}, +	{ 0x00000011, 0x00000013, CPU_PMC,	}, +	{ 0x00000017, 0x00000017, CPU_PLATFORM,	}, +	{ 0x0000001B, 0x0000001B, CPU_APIC,	}, +	{ 0x0000002A, 0x0000002B, CPU_POWERON,	}, +	{ 0x0000002C, 0x0000002C, CPU_FREQ,	}, +	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	}, +	{ 0x00000040, 0x00000047, CPU_LBRANCH,	}, +	{ 0x00000060, 0x00000067, CPU_LBRANCH,	}, +	{ 0x00000079, 0x00000079, CPU_BIOS,	}, +	{ 0x00000088, 0x0000008A, CPU_CACHE,	}, +	{ 0x0000008B, 0x0000008B, CPU_BIOS,	}, +	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	}, +	{ 0x000000C1, 0x000000C4, CPU_PMC,	}, +	{ 0x000000CD, 0x000000CD, CPU_FREQ,	}, +	{ 0x000000E7, 0x000000E8, CPU_PERF,	}, +	{ 0x000000FE, 0x000000FE, CPU_MTRR,	}, -	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		}, -	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		}, -	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		}, -	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		}, +	{ 0x00000116, 0x0000011E, CPU_CACHE,	}, +	{ 0x00000174, 0x00000176, CPU_SYSENTER,	}, +	{ 0x00000179, 0x0000017B, CPU_MC,	}, +	{ 0x00000186, 0x00000189, CPU_PMC,	}, +	{ 0x00000198, 0x00000199, CPU_PERF,	}, +	{ 0x0000019A, 0x0000019A, CPU_TIME,	}, +	{ 0x0000019B, 0x0000019D, CPU_THERM,	}, +	{ 0x000001A0, 0x000001A0, CPU_MISC,	}, +	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	}, +	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	}, +	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	}, +	{ 0x000001DA, 0x000001E0, CPU_LBRANCH,	}, -	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		}, -	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		}, -	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		}, -	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		}, +	{ 0x00000200, 0x0000020F, CPU_MTRR,	}, +	{ 0x00000250, 0x00000250, CPU_MTRR,	}, +	{ 0x00000258, 0x00000259, 
CPU_MTRR,	}, +	{ 0x00000268, 0x0000026F, CPU_MTRR,	}, +	{ 0x00000277, 0x00000277, CPU_PAT,	}, +	{ 0x000002FF, 0x000002FF, CPU_MTRR,	}, -	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		}, -	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		}, -	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		}, +	{ 0x00000300, 0x00000311, CPU_PMC,	}, +	{ 0x00000345, 0x00000345, CPU_PMC,	}, +	{ 0x00000360, 0x00000371, CPU_PMC,	}, +	{ 0x0000038D, 0x00000390, CPU_PMC,	}, +	{ 0x000003A0, 0x000003BE, CPU_PMC,	}, +	{ 0x000003C0, 0x000003CD, CPU_PMC,	}, +	{ 0x000003E0, 0x000003E1, CPU_PMC,	}, +	{ 0x000003F0, 0x000003F2, CPU_PMC,	}, -	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		}, -	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		}, -	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		}, -	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		}, +	{ 0x00000400, 0x00000417, CPU_MC,	}, +	{ 0x00000480, 0x0000048B, CPU_VMX,	}, -	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		}, -	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		}, -	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		}, +	{ 0x00000600, 0x00000600, CPU_DEBUG,	}, +	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	}, +	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	}, -	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		}, -	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		}, -	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		}, -	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		}, -	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		}, -	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		}, -	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		}, -	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		}, +	{ 0x000107CC, 0x000107D3, CPU_PMC,	}, -	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		}, -	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		}, -	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		}, -	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		}, -	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		}, -	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		}, +	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	}, +	{ 0xC0000081, 0xC0000084, CPU_CALL,	}, +	{ 0xC0000100, 0xC0000102, CPU_BASE,	}, +	{ 0xC0000103, 0xC0000103, CPU_TIME,	}, -	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		}, -	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		}, -	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		}, - -	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		}, -	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		}, -	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		}, -	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		}, -	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		}, -	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		}, - -	{ 0x00000400, 0x00000402, CPU_MC,	
CPU_PM_CX_AT_XE		}, -	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		}, -	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		}, -	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		}, -	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		}, -	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		}, - -	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		}, -	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		}, -	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		}, - -	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	}, - -	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		}, -	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		}, -	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		}, -	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		}, +	{ 0xC0010000, 0xC0010007, CPU_PMC,	}, +	{ 0xC0010010, 0xC0010010, CPU_CONF,	}, +	{ 0xC0010015, 0xC0010015, CPU_CONF,	}, +	{ 0xC0010016, 0xC001001A, CPU_MTRR,	}, +	{ 0xC001001D, 0xC001001D, CPU_MTRR,	}, +	{ 0xC001001F, 0xC001001F, CPU_CONF,	}, +	{ 0xC0010030, 0xC0010035, CPU_BIOS,	}, +	{ 0xC0010044, 0xC0010048, CPU_MC,	}, +	{ 0xC0010050, 0xC0010056, CPU_SMM,	}, +	{ 0xC0010058, 0xC0010058, CPU_CONF,	}, +	{ 0xC0010060, 0xC0010060, CPU_CACHE,	}, +	{ 0xC0010061, 0xC0010068, CPU_SMM,	}, +	{ 0xC0010069, 0xC001006B, CPU_SMM,	}, +	{ 0xC0010070, 0xC0010071, CPU_SMM,	}, +	{ 0xC0010111, 0xC0010113, CPU_SMM,	}, +	{ 0xC0010114, 0xC0010118, CPU_SVM,	}, +	{ 0xC0010140, 0xC0010141, CPU_OSVM,	}, +	{ 0xC0011022, 0xC0011023, CPU_CONF,	},  }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { -	{ 0x00000000, 0x00000001, CPU_MC,	CPU_K10_PLUS,		}, -	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		}, -	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		}, -	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		}, -	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		}, -	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		}, - -	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		}, -	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		}, -	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		}, -	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		}, - -	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		}, -	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		}, - -	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		}, - -	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		}, -	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		}, -	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		}, -	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		}, - -	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		}, -	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		}, -	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		}, -	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		}, -	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		}, -	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		}, -	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		}, -	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		}, -	{ 0xC0010058, 0xC0010058, 
CPU_CONF,	CPU_K10_PLUS,		}, -	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		}, -	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		}, -	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		}, -	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		}, -	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		}, -	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		}, -	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		}, -	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		}, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ -	int flag; - -	switch (model) { -	case 0x0501: -	case 0x0502: -	case 0x0504: -		flag = CPU_INTEL_PENTIUM; -		break; -	case 0x0601: -	case 0x0603: -	case 0x0605: -	case 0x0607: -	case 0x0608: -	case 0x060A: -	case 0x060B: -		flag = CPU_INTEL_P6; -		break; -	case 0x0609: -	case 0x060D: -		flag = CPU_INTEL_PENTIUM_M; -		break; -	case 0x060E: -		flag = CPU_INTEL_CORE; -		break; -	case 0x060F: -	case 0x0617: -		flag = CPU_INTEL_CORE2; -		break; -	case 0x061C: -		flag = CPU_INTEL_ATOM; -		break; -	case 0x0F00: -	case 0x0F01: -	case 0x0F02: -	case 0x0F03: -	case 0x0F04: -		flag = CPU_INTEL_XEON_P4; -		break; -	case 0x0F06: -		flag = CPU_INTEL_XEON_MP; -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ -	int flag; - -	switch (model >> 8) { -	case 0x6: -		flag = CPU_AMD_K6; -		break; -	case 0x7: -		flag = CPU_AMD_K7; -		break; -	case 0x8: -		flag = CPU_AMD_K8; -		break; -	case 0xf: -		flag = CPU_AMD_0F; -		break; -	case 0x10: -		flag = CPU_AMD_10; -		break; -	case 0x11: -		flag = CPU_AMD_11; -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ -	int flag; - -	flag = per_cpu(cpu_model, cpu); - -	switch (flag >> 16) { -	case X86_VENDOR_INTEL: -		flag = get_intel_modelflag(flag); -		break; -	case X86_VENDOR_AMD: -		flag = get_amd_modelflag(flag & 0xffff); -		break; -	default: -		flag = CPU_NONE; -		break; -	} - -	return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ -	int index; - -	switch (per_cpu(cpu_model, cpu) >> 16) { -	case X86_VENDOR_INTEL: -		index = ARRAY_SIZE(cpu_intel_range); -		break; -	case X86_VENDOR_AMD: -		index = ARRAY_SIZE(cpu_amd_range); -		break; -	default: -		index = 0; -		break; -	} - -	return index; -} -  static int is_typeflag_valid(unsigned cpu, unsigned flag)  { -	unsigned vendor, modelflag; -	int i, index; +	int i;  	/* Standard Registers should be always valid */  	if (flag >= CPU_TSS)  		return 1; -	modelflag = per_cpu(cpu_modelflag, cpu); -	vendor = per_cpu(cpu_model, cpu) >> 16; -	index = get_cpu_range_count(cpu); - -	for (i = 0; i < index; i++) { -		switch (vendor) { -		case X86_VENDOR_INTEL: -			if ((cpu_intel_range[i].model & modelflag) && -			    (cpu_intel_range[i].flag & flag)) -				return 1; -			break; -		case X86_VENDOR_AMD: -			if ((cpu_amd_range[i].model & modelflag) && -			    (cpu_amd_range[i].flag & flag)) -				return 1; -			break; -		} +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { +		if (cpu_reg_range[i].flag == flag) +			return 1;  	}  	/* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)  static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,  			      int index, unsigned flag)  { -	unsigned modelflag; - -	modelflag = per_cpu(cpu_modelflag, cpu); -	*max = 0; -	switch (per_cpu(cpu_model, cpu) >> 16) { -	case X86_VENDOR_INTEL: -		if ((cpu_intel_range[index].model & modelflag) && -		    
(cpu_intel_range[index].flag & flag)) { -			*min = cpu_intel_range[index].min; -			*max = cpu_intel_range[index].max; -		} -		break; -	case X86_VENDOR_AMD: -		if ((cpu_amd_range[index].model & modelflag) && -		    (cpu_amd_range[index].flag & flag)) { -			*min = cpu_amd_range[index].min; -			*max = cpu_amd_range[index].max; -		} -		break; -	} +	if (cpu_reg_range[index].flag == flag) { +		*min = cpu_reg_range[index].min; +		*max = cpu_reg_range[index].max; +	} else +		*max = 0;  	return *max;  } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)  	unsigned msr, msr_min, msr_max;  	struct cpu_private *priv;  	u32 low, high; -	int i, range; +	int i;  	if (seq) {  		priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)  		}  	} -	range = get_cpu_range_count(cpu); - -	for (i = 0; i < range; i++) { +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {  		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))  			continue; @@ -588,8 +369,20 @@ static void print_apic(void *arg)  	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));  	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));  	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ +	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { +		unsigned int i, v, maxeilvt; +		v = apic_read(APIC_EFEAT); +		maxeilvt = (v >> 16) & 0xff; +		seq_printf(seq, " EFEAT\t\t: %08x\n", v); +		seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + +		for (i = 0; i < maxeilvt; i++) { +			v = apic_read(APIC_EILVTn(i)); +			seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); +		} +	} +#endif /* CONFIG_X86_LOCAL_APIC */  	seq_printf(seq, "\n MSR\t:\n");  } @@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)  {  	struct dentry *cpu_dentry = NULL;  	unsigned reg, reg_min, reg_max; -	int i, range, err = 0; +	int i, err = 0;  	char reg_dir[12];  	u32 low, high; -	range = get_cpu_range_count(cpu); - -	for (i = 0; i < range; i++) { +	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {  		if (!get_cpu_range(cpu, ®_min, ®_max, i,  				   cpu_base[type].flag))  			continue; @@ -850,10 +641,6 @@ static int cpu_init_cpu(void)  		cpui = &cpu_data(cpu);  		if (!cpu_has(cpui, X86_FEATURE_MSR))  			continue; -		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | -					   (cpui->x86 << 8) | -					   (cpui->x86_model)); -		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);  		sprintf(cpu_dir, "cpu%d", cpu);  		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c83987547..f138c6c389b 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL  	  If in doubt, say N.  config X86_E_POWERSAVER -	tristate "VIA C7 Enhanced PowerSaver" +	tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"  	select CPU_FREQ_TABLE -	depends on X86_32 +	depends on X86_32 && EXPERIMENTAL  	help -	  This adds the CPUFreq driver for VIA C7 processors. +	  This adds the CPUFreq driver for VIA C7 processors.  However, this driver +	  does not have any safeguards to prevent operating the CPU out of spec +	  and is thus considered dangerous.  Please use the regular ACPI cpufreq +	  driver, enabled by CONFIG_X86_ACPI_CPUFREQ.  	  If in doubt, say N. 
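
Side note on the cpu_debug.c hunk above: the per-vendor Intel/AMD MSR range tables (keyed by model flags) are collapsed into one flat cpu_reg_range[] table, and both the type-flag check and the range lookup become plain linear scans. The following is only a small userspace sketch of that table-driven pattern, with made-up names and values, not the kernel code itself:

/*
 * Illustrative sketch of the flattened range-table lookup: one
 * vendor-agnostic table of {min, max, flag} entries, validity and
 * range retrieval done by a simple linear scan (compare the new
 * is_typeflag_valid() and get_cpu_range() above).
 */
#include <stdio.h>
#include <stddef.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

enum reg_flag { CPU_MC = 1, CPU_TIME, CPU_PMC, CPU_MTRR };

struct reg_range {
	unsigned int min;
	unsigned int max;
	enum reg_flag flag;
};

/* Flat table, analogous in shape to cpu_reg_range[]; values are examples. */
static const struct reg_range reg_range[] = {
	{ 0x00000000, 0x00000001, CPU_MC   },
	{ 0x00000010, 0x00000010, CPU_TIME },
	{ 0x000000C1, 0x000000C4, CPU_PMC  },
	{ 0x00000200, 0x0000020F, CPU_MTRR },
};

/* Is any MSR range registered for this flag at all? */
static int flag_is_valid(enum reg_flag flag)
{
	size_t i;

	for (i = 0; i < ARRAY_SIZE(reg_range); i++)
		if (reg_range[i].flag == flag)
			return 1;
	return 0;
}

/* Fetch the index-th range if it matches the flag; returns 0 otherwise. */
static unsigned int get_range(size_t index, enum reg_flag flag,
			      unsigned int *min, unsigned int *max)
{
	if (reg_range[index].flag == flag) {
		*min = reg_range[index].min;
		*max = reg_range[index].max;
	} else {
		*max = 0;
	}
	return *max;
}

int main(void)
{
	unsigned int min, max;
	size_t i;

	if (!flag_is_valid(CPU_PMC))
		return 1;

	/* Walk every matching range for the flag, as the msr printer does. */
	for (i = 0; i < ARRAY_SIZE(reg_range); i++) {
		if (!get_range(i, CPU_PMC, &min, &max))
			continue;
		printf("PMC MSRs: 0x%08x - 0x%08x\n", min, max);
	}
	return 0;
}

The design point the hunk makes is that dropping the per-vendor model flags trades a little precision (some listed MSRs may not exist on a given part) for a much smaller table and simpler lookup code.
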
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6b2c7..ae9b503220c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)  {  	struct cpuinfo_x86 *cpu = &cpu_data(cpuid); -	if (cpu->x86_vendor != X86_VENDOR_INTEL || -	    !cpu_has(cpu, X86_FEATURE_EST)) -		return 0; - -	return 1; +	return cpu_has(cpu, X86_FEATURE_EST);  }  static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index cf52215d9eb..81cbe64ed6b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,3 +1,4 @@ +  /*   *   (c) 2003-2006 Advanced Micro Devices, Inc.   *  Your use of this code is subject to the terms and conditions of the @@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)  	u32 i = 0;  	if (cpu_family == CPU_HW_PSTATE) { -		if (data->currpstate == HW_PSTATE_INVALID) { -			/* read (initial) hw pstate if not yet set */ -			rdmsr(MSR_PSTATE_STATUS, lo, hi); -			i = lo & HW_PSTATE_MASK; +		rdmsr(MSR_PSTATE_STATUS, lo, hi); +		i = lo & HW_PSTATE_MASK; +		data->currpstate = i; + +		/* +		 * a workaround for family 11h erratum 311 might cause +		 * an "out-of-range Pstate if the core is in Pstate-0 +		 */ +		if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) +			data->currpstate = HW_PSTATE_0; -			/* -			 * a workaround for family 11h erratum 311 might cause -			 * an "out-of-range Pstate if the core is in Pstate-0 -			 */ -			if (i >= data->numps) -				data->currpstate = HW_PSTATE_0; -			else -				data->currpstate = i; -		}  		return 0;  	}  	do { @@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,  	return 0;  } -static int check_supported_cpu(unsigned int cpu) +static void check_supported_cpu(void *_rc)  { -	cpumask_t oldmask;  	u32 eax, ebx, ecx, edx; -	unsigned int rc = 0; - -	oldmask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); +	int *rc = _rc; -	if (smp_processor_id() != cpu) { -		printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); -		goto out; -	} +	*rc = -ENODEV;  	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) -		goto out; +		return;  	eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);  	if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&  	    ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) -		goto out; +		return;  	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {  		if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||  		    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {  			printk(KERN_INFO PFX  				"Processor cpuid %x not supported\n", eax); -			goto out; +			return;  		}  		eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);  		if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {  			printk(KERN_INFO PFX  			       "No frequency change capabilities detected\n"); -			goto out; +			return;  		}  		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); @@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu)  			!= P_STATE_TRANSITION_CAPABLE) {  			printk(KERN_INFO PFX  				"Power state transitions not supported\n"); -			goto out; +			return;  		}  	} else { /* must be a HW Pstate capable processor */  		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);  		if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)  			cpu_family = CPU_HW_PSTATE;  		else -		
	goto out; +			return;  	} -	rc = 1; - -out: -	set_cpus_allowed_ptr(current, &oldmask); -	return rc; +	*rc = 0;  }  static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, @@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,  	if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))  		return; -	control = data->acpi_data.states[index].control; data->irt = (control -			>> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> -				RVO_SHIFT) & RVO_MASK; data->exttype = (control -					>> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; -	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 -		<< ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = -		(control >> VST_SHIFT) & VST_MASK; } +	control = data->acpi_data.states[index].control; +	data->irt = (control >> IRT_SHIFT) & IRT_MASK; +	data->rvo = (control >> RVO_SHIFT) & RVO_MASK; +	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; +	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; +	data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); +	data->vstable = (control >> VST_SHIFT) & VST_MASK; +}  static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  { @@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)  		if (cur_latency > max_latency)  			max_latency = cur_latency;  	} +	if (max_latency == 0) { +		/* +		 * Fam 11h always returns 0 as transition latency. +		 * This is intended and means "very fast". While cpufreq core +		 * and governors currently can handle that gracefully, better +		 * set it to 1 to avoid problems in the future. +		 * For all others it's a BIOS bug. +		 */ +		if (!boot_cpu_data.x86 == 0x11) +			printk(KERN_ERR FW_WARN PFX "Invalid zero transition " +				"latency\n"); +		max_latency = 1; +	}  	/* value in usecs, needs to be in nanoseconds */  	return 1000 * max_latency;  } @@ -1093,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,  	freqs.old = find_khz_freq_from_fid(data->currfid);  	freqs.new = find_khz_freq_from_fid(fid); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} @@ -1101,7 +1102,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,  	res = transition_fid_vid(data, fid, vid);  	freqs.new = find_khz_freq_from_fid(data->currfid); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  	} @@ -1126,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,  			data->currpstate);  	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} @@ -1134,7 +1135,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,  	res = transition_pstate(data, pstate);  	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); -	for_each_cpu_mask_nr(i, *(data->available_cores)) { +	for_each_cpu(i, data->available_cores) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  	} @@ -1235,21 +1236,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)  	return cpufreq_frequency_table_verify(pol, data->powernow_table);  } -static const char 
ACPI_PSS_BIOS_BUG_MSG[] = -	KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" -	KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; +struct init_on_cpu { +	struct powernow_k8_data *data; +	int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ +	struct init_on_cpu *init_on_cpu = _init_on_cpu; + +	if (pending_bit_stuck()) { +		printk(KERN_ERR PFX "failing init, change pending bit set\n"); +		init_on_cpu->rc = -ENODEV; +		return; +	} + +	if (query_current_values_with_pending_wait(init_on_cpu->data)) { +		init_on_cpu->rc = -ENODEV; +		return; +	} + +	if (cpu_family == CPU_OPTERON) +		fidvid_msr_init(); + +	init_on_cpu->rc = 0; +}  /* per CPU init entry point to the driver */  static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  { +	static const char ACPI_PSS_BIOS_BUG_MSG[] = +		KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" +		KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";  	struct powernow_k8_data *data; -	cpumask_t oldmask; +	struct init_on_cpu init_on_cpu;  	int rc;  	if (!cpu_online(pol->cpu))  		return -ENODEV; -	if (!check_supported_cpu(pol->cpu)) +	smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); +	if (rc)  		return -ENODEV;  	data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); @@ -1289,27 +1316,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  		pol->cpuinfo.transition_latency = get_transition_latency(data);  	/* only run on specific CPU from here on */ -	oldmask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); - -	if (smp_processor_id() != pol->cpu) { -		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); -		goto err_out_unmask; -	} - -	if (pending_bit_stuck()) { -		printk(KERN_ERR PFX "failing init, change pending bit set\n"); -		goto err_out_unmask; -	} - -	if (query_current_values_with_pending_wait(data)) -		goto err_out_unmask; - -	if (cpu_family == CPU_OPTERON) -		fidvid_msr_init(); - -	/* run on any CPU again */ -	set_cpus_allowed_ptr(current, &oldmask); +	init_on_cpu.data = data; +	smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, +				 &init_on_cpu, 1); +	rc = init_on_cpu.rc; +	if (rc != 0) +		goto err_out_exit_acpi;  	if (cpu_family == CPU_HW_PSTATE)  		cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); @@ -1346,8 +1358,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)  	return 0; -err_out_unmask: -	set_cpus_allowed_ptr(current, &oldmask); +err_out_exit_acpi:  	powernow_k8_cpu_exit_acpi(data);  err_out: @@ -1372,28 +1383,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)  	return 0;  } +static void query_values_on_cpu(void *_err) +{ +	int *err = _err; +	struct powernow_k8_data *data = __get_cpu_var(powernow_data); + +	*err = query_current_values_with_pending_wait(data); +} +  static unsigned int powernowk8_get(unsigned int cpu)  { -	struct powernow_k8_data *data; -	cpumask_t oldmask = current->cpus_allowed; +	struct powernow_k8_data *data = per_cpu(powernow_data, cpu);  	unsigned int khz = 0; -	unsigned int first; - -	first = cpumask_first(cpu_core_mask(cpu)); -	data = per_cpu(powernow_data, first); +	int err;  	if (!data)  		return -EINVAL; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); -	if (smp_processor_id() != cpu) { -		printk(KERN_ERR PFX -			"limiting to CPU %d failed in powernowk8_get\n", cpu); -		set_cpus_allowed_ptr(current, &oldmask); -		return 0; -	} - -	if (query_current_values_with_pending_wait(data)) +	
smp_call_function_single(cpu, query_values_on_cpu, &err, true); +	if (err)  		goto out;  	if (cpu_family == CPU_HW_PSTATE) @@ -1404,7 +1412,6 @@ static unsigned int powernowk8_get(unsigned int cpu)  out: -	set_cpus_allowed_ptr(current, &oldmask);  	return khz;  } @@ -1430,7 +1437,9 @@ static int __cpuinit powernowk8_init(void)  	unsigned int i, supported_cpus = 0;  	for_each_online_cpu(i) { -		if (check_supported_cpu(i)) +		int rc; +		smp_call_function_single(i, check_supported_cpu, &rc, 1); +		if (rc == 0)  			supported_cpus++;  	} diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698feade..c9c1190b5e1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -223,14 +223,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned  static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);  static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); - -#ifdef CONFIG_SMP -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -} -#else -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -	cpu_set(0, cpu_sharedcore_mask[0]); -} -#endif diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 55c831ed71c..8d672ef162c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)  {  	unsigned l, h;  	unsigned clock_freq; -	cpumask_t saved_mask; -	saved_mask = current->cpus_allowed; -	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); -	if (smp_processor_id() != cpu) -		return 0; - -	rdmsr(MSR_IA32_PERF_STATUS, l, h); +	rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);  	clock_freq = extract_clock(l, cpu, 0);  	if (unlikely(clock_freq == 0)) { @@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)  		 * P-state transition (like TM2). Get the last freq set   		 * in PERF_CTL.  		 
*/ -		rdmsr(MSR_IA32_PERF_CTL, l, h); +		rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);  		clock_freq = extract_clock(l, cpu, 1);  	} - -	set_cpus_allowed_ptr(current, &saved_mask);  	return clock_freq;  } @@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,  	struct cpufreq_freqs	freqs;  	int			retval = 0;  	unsigned int		j, k, first_cpu, tmp; -	cpumask_var_t saved_mask, covered_cpus; +	cpumask_var_t covered_cpus; -	if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) -		return -ENOMEM; -	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { -		free_cpumask_var(saved_mask); +	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))  		return -ENOMEM; -	} -	cpumask_copy(saved_mask, ¤t->cpus_allowed);  	if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {  		retval = -ENODEV; @@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,  	first_cpu = 1;  	for_each_cpu(j, policy->cpus) { -		const struct cpumask *mask; +		int good_cpu;  		/* cpufreq holds the hotplug lock, so we are safe here */  		if (!cpu_online(j)) @@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,  		 * Make sure we are running on CPU that wants to change freq  		 */  		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) -			mask = policy->cpus; +			good_cpu = cpumask_any_and(policy->cpus, +						   cpu_online_mask);  		else -			mask = cpumask_of(j); +			good_cpu = j; -		set_cpus_allowed_ptr(current, mask); -		preempt_disable(); -		if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { +		if (good_cpu >= nr_cpu_ids) {  			dprintk("couldn't limit to CPUs in this domain\n");  			retval = -EAGAIN;  			if (first_cpu) {  				/* We haven't started the transition yet. */ -				goto migrate_end; +				goto out;  			} -			preempt_enable();  			break;  		}  		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;  		if (first_cpu) { -			rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); +			rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);  			if (msr == (oldmsr & 0xffff)) {  				dprintk("no change needed - msr was and needs "  					"to be %x\n", oldmsr);  				retval = 0; -				goto migrate_end; +				goto out;  			}  			freqs.old = extract_clock(oldmsr, cpu, 0); @@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,  			oldmsr |= msr;  		} -		wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); -		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { -			preempt_enable(); +		wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); +		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)  			break; -		} -		cpu_set(j, *covered_cpus); -		preempt_enable(); +		cpumask_set_cpu(j, covered_cpus);  	}  	for_each_cpu(k, policy->cpus) { @@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,  		 * Best effort undo..  		 
*/ -		for_each_cpu_mask_nr(j, *covered_cpus) { -			set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); -			wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); -		} +		for_each_cpu(j, covered_cpus) +			wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);  		tmp = freqs.new;  		freqs.new = freqs.old; @@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,  			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);  		}  	} -	set_cpus_allowed_ptr(current, saved_mask);  	retval = 0; -	goto out; -migrate_end: -	preempt_enable(); -	set_cpus_allowed_ptr(current, saved_mask);  out: -	free_cpumask_var(saved_mask);  	free_cpumask_var(covered_cpus);  	return retval;  } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4fa3f..6911e91fb4f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -89,7 +89,8 @@ static int speedstep_find_register(void)   * speedstep_set_state - set the SpeedStep state   * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)   * - *   Tries to change the SpeedStep state. + *   Tries to change the SpeedStep state.  Can be called from + *   smp_call_function_single.   */  static void speedstep_set_state(unsigned int state)  { @@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)  	return;  } +/* Wrapper for smp_call_function_single. */ +static void _speedstep_set_state(void *_state) +{ +	speedstep_set_state(*(unsigned int *)_state); +}  /**   * speedstep_activate - activate SpeedStep control in the chipset @@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)  	return 0;  } -static unsigned int _speedstep_get(const struct cpumask *cpus) -{ +struct get_freq_data {  	unsigned int speed; -	cpumask_t cpus_allowed; +	unsigned int processor; +}; + +static void get_freq_data(void *_data) +{ +	struct get_freq_data *data = _data; -	cpus_allowed = current->cpus_allowed; -	set_cpus_allowed_ptr(current, cpus); -	speed = speedstep_get_frequency(speedstep_processor); -	set_cpus_allowed_ptr(current, &cpus_allowed); -	dprintk("detected %u kHz as current frequency\n", speed); -	return speed; +	data->speed = speedstep_get_frequency(data->processor);  }  static unsigned int speedstep_get(unsigned int cpu)  { -	return _speedstep_get(cpumask_of(cpu)); +	struct get_freq_data data = { .processor = cpu }; + +	/* You're supposed to ensure CPU is online. 
*/ +	if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) +		BUG(); + +	dprintk("detected %u kHz as current frequency\n", data.speed); +	return data.speed;  }  /** @@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,  			     unsigned int target_freq,  			     unsigned int relation)  { -	unsigned int newstate = 0; +	unsigned int newstate = 0, policy_cpu;  	struct cpufreq_freqs freqs; -	cpumask_t cpus_allowed;  	int i;  	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],  				target_freq, relation, &newstate))  		return -EINVAL; -	freqs.old = _speedstep_get(policy->cpus); +	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); +	freqs.old = speedstep_get(policy_cpu);  	freqs.new = speedstep_freqs[newstate].frequency;  	freqs.cpu = policy->cpu; @@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,  	if (freqs.old == freqs.new)  		return 0; -	cpus_allowed = current->cpus_allowed; -  	for_each_cpu(i, policy->cpus) {  		freqs.cpu = i;  		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);  	} -	/* switch to physical CPU where state is to be changed */ -	set_cpus_allowed_ptr(current, policy->cpus); - -	speedstep_set_state(newstate); - -	/* allow to be run on all CPUs */ -	set_cpus_allowed_ptr(current, &cpus_allowed); +	smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, +				 true);  	for_each_cpu(i, policy->cpus) {  		freqs.cpu = i; @@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)  	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);  } +struct get_freqs { +	struct cpufreq_policy *policy; +	int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ +	struct get_freqs *get_freqs = _get_freqs; + +	get_freqs->ret = +		speedstep_get_freqs(speedstep_processor, +			    &speedstep_freqs[SPEEDSTEP_LOW].frequency, +			    &speedstep_freqs[SPEEDSTEP_HIGH].frequency, +			    &get_freqs->policy->cpuinfo.transition_latency, +			    &speedstep_set_state); +}  static int speedstep_cpu_init(struct cpufreq_policy *policy)  { -	int result = 0; -	unsigned int speed; -	cpumask_t cpus_allowed; +	int result; +	unsigned int policy_cpu, speed; +	struct get_freqs gf;  	/* only run on CPU to be set, or on its sibling */  #ifdef CONFIG_SMP  	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));  #endif - -	cpus_allowed = current->cpus_allowed; -	set_cpus_allowed_ptr(current, policy->cpus); +	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);  	/* detect low and high frequency and transition latency */ -	result = speedstep_get_freqs(speedstep_processor, -				     &speedstep_freqs[SPEEDSTEP_LOW].frequency, -				     &speedstep_freqs[SPEEDSTEP_HIGH].frequency, -				     &policy->cpuinfo.transition_latency, -				     &speedstep_set_state); -	set_cpus_allowed_ptr(current, &cpus_allowed); -	if (result) -		return result; +	gf.policy = policy; +	smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); +	if (gf.ret) +		return gf.ret;  	/* get current speed setting */ -	speed = _speedstep_get(policy->cpus); +	speed = speedstep_get(policy_cpu);  	if (!speed)  		return -EIO; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c6862657..f4c290b8482 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)  } +/* Warning: may get called from smp_call_function_single. 
*/  unsigned int speedstep_get_frequency(unsigned int processor)  {  	switch (processor) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c0..3260ab04499 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)  	 */  	if (c->x86 == 6 && c->x86_model < 15)  		clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK +	/* +	 * P4s have a "fast strings" feature which causes single- +	 * stepping REP instructions to only generate a #DB on +	 * cache-line boundaries. +	 * +	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon +	 * (model 2) with the same problem. +	 */ +	if (c->x86 == 15) { +		u64 misc_enable; + +		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + +		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { +			printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); + +			misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; +			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); +		} +	} +#endif  }  #ifdef CONFIG_X86_32 @@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)  }  #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  {  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)  	unsigned node;  	int cpu = smp_processor_id(); -	int apicid = hard_smp_processor_id(); +	int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;  	/* Don't do the funky fallback heuristics the AMD version employs  	   for now. */ @@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  	}  	/* Work around errata */ -	srat_detect_node(); +	srat_detect_node(c);  	if (cpu_has(c, X86_FEATURE_VMX))  		detect_vmx_virtcap(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@  #include <asm/processor.h>  #include <asm/smp.h> +#include <asm/k8.h>  #define LVL_1_INST	1  #define LVL_1_DATA	2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs {  	unsigned long can_disable;  }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { -	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, -	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, -	{} -}; -#endif -  unsigned short			num_cache_leaves;  /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -207,10 +200,17 @@ union l3_cache {  };  static const unsigned short __cpuinitconst assocs[] = { -	[1] = 1, [2] = 2, [4] = 4, [6] = 8, -	[8] = 16, [0xa] = 32, [0xb] = 48, +	[1] = 1, +	[2] = 2, +	[4] = 4, +	[6] = 8, +	[8] = 16, +	[0xa] = 32, +	[0xb] = 48,  	[0xc] = 64, -	[0xf] = 0xffff // ?? 
+	[0xd] = 96, +	[0xe] = 128, +	[0xf] = 0xffff /* fully associative - no way to show this currently */  };  static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,  	eax->split.type = types[leaf];  	eax->split.level = levels[leaf];  	if (leaf == 3) -		eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; +		eax->split.num_threads_sharing = +			current_cpu_data.x86_max_cores - 1;  	else  		eax->split.num_threads_sharing = 0;  	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; @@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)  {  	if (index < 3)  		return; + +	if (boot_cpu_data.x86 == 0x11) +		return; + +	/* see erratum #382 */ +	if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) +		return; +  	this_leaf->can_disable = 1;  } @@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)  #define to_object(k)	container_of(k, struct _index_kobject, kobj)  #define to_attr(a)	container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ -	struct pci_dev *dev = NULL; -	int i; - -	for (i = 0; i <= node; i++) { -		do { -			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); -			if (!dev) -				break; -		} while (!pci_match_id(&k8_nb_id[0], dev)); -		if (!dev) -			break; -	} -	return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ -	return NULL; -} -#endif - -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, +				  unsigned int index)  { -	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); -	int node = cpu_to_node(cpumask_first(mask)); -	struct pci_dev *dev = NULL; -	ssize_t ret = 0; -	int i; +	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); +	int node = cpu_to_node(cpu); +	struct pci_dev *dev = node_to_k8_nb_misc(node); +	unsigned int reg = 0;  	if (!this_leaf->can_disable) -		return sprintf(buf, "Feature not enabled\n"); - -	dev = get_k8_northbridge(node); -	if (!dev) { -		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");  		return -EINVAL; -	} -	for (i = 0; i < 2; i++) { -		unsigned int reg; +	if (!dev) +		return -EINVAL; -		pci_read_config_dword(dev, 0x1BC + i * 4, ®); +	pci_read_config_dword(dev, 0x1BC + index * 4, ®); +	return sprintf(buf, "%x\n", reg); +} -		ret += sprintf(buf, "%sEntry: %d\n", buf, i); -		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",   -			buf, -			reg & 0x80000000 ? "Disabled" : "Allowed", -			reg & 0x40000000 ? 
"Disabled" : "Allowed"); -		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", -			buf, (reg & 0x30000) >> 16, reg & 0xfff); -	} -	return ret; +#define SHOW_CACHE_DISABLE(index)					\ +static ssize_t								\ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)  	\ +{									\ +	return show_cache_disable(this_leaf, buf, index);		\  } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, -		    size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, +	const char *buf, size_t count, unsigned int index)  { -	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); -	int node = cpu_to_node(cpumask_first(mask)); -	struct pci_dev *dev = NULL; -	unsigned int ret, index, val; +	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); +	int node = cpu_to_node(cpu); +	struct pci_dev *dev = node_to_k8_nb_misc(node); +	unsigned long val = 0; +	unsigned int scrubber = 0;  	if (!this_leaf->can_disable) -		return 0; - -	if (strlen(buf) > 15)  		return -EINVAL; -	ret = sscanf(buf, "%x %x", &index, &val); -	if (ret != 2) +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (!dev)  		return -EINVAL; -	if (index > 1) + +	if (strict_strtoul(buf, 10, &val) < 0)  		return -EINVAL;  	val |= 0xc0000000; -	dev = get_k8_northbridge(node); -	if (!dev) { -		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); -		return -EINVAL; -	} + +	pci_read_config_dword(dev, 0x58, &scrubber); +	scrubber &= ~0x1f000000; +	pci_write_config_dword(dev, 0x58, scrubber);  	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);  	wbinvd();  	pci_write_config_dword(dev, 0x1BC + index * 4, val); +	return count; +} -	return 1; +#define STORE_CACHE_DISABLE(index)					\ +static ssize_t								\ +store_cache_disable_##index(struct _cpuid4_info *this_leaf,	     	\ +			    const char *buf, size_t count)		\ +{									\ +	return store_cache_disable(this_leaf, buf, count, index);	\  } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1)  struct _cache_attr {  	struct attribute attr; @@ -808,7 +795,10 @@ define_one_ro(size);  define_one_ro(shared_cpu_map);  define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, +		show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, +		show_cache_disable_1, store_cache_disable_1);  static struct attribute * default_attrs[] = {  	&type.attr, @@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {  	&size.attr,  	&shared_cpu_map.attr,  	&shared_cpu_list.attr, -	&cache_disable.attr, +	&cache_disable_0.attr, +	&cache_disable_1.attr,  	NULL  }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe..188a1ca5ad2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,12 @@ -obj-y				=  mce_$(BITS).o therm_throt.o +obj-y				=  mce.o -obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o +obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o +obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o +obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o  
obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o  obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39..b945d5dbc60 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,25 +2,23 @@   * Athlon specific Machine Check Exception Reporting   * (C) Copyright 2002 Dave Jones <davej@redhat.com>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" - -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */  static void k7_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)  	for (i = 1; i < nr_mce_banks; i++) {  		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high&(1<<31)) { +		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; -			misc[0] = addr[0] = '\0'; + +			misc[0] = '\0'; +			addr[0] = '\0'; +  			if (high & (1<<29))  				recover |= 1;  			if (high & (1<<25))  				recover |= 2;  			high &= ~(1<<31); +  			if (high & (1<<27)) {  				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);  				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)  				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);  				snprintf(addr, 24, " at %08x%08x", ahigh, alow);  			} +  			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",  				smp_processor_id(), i, high, low, misc, addr); -			/* Clear it */ + +			/* Clear it: */  			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); -			/* Serialize */ +			/* Serialize: */  			wmb();  			add_taint(TAINT_MACHINE_CHECK);  		}  	} -	if (recover&2) +	if (recover & 2)  		panic("CPU context corrupt"); -	if (recover&1) +	if (recover & 1)  		panic("Unable to continue"); +  	printk(KERN_EMERG "Attempting to continue.\n"); +  	mcgstl &= ~(1<<2);  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */  void amd_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; @@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)  		return;  	machine_check_vector = k7_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb();  	printk(KERN_INFO "Intel machine check architecture supported.\n"); +  	rdmsr(MSR_IA32_MCG_CAP, l, h);  	if (l & (1<<8))	/* Control register present ? */  		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);  	nr_mce_banks = l & 0xff; -	/* Clear status for MC index 0 separately, we don't touch CTL, -	 * as some K7 Athlons cause spurious MCEs when its enabled. 
*/ +	/* +	 * Clear status for MC index 0 separately, we don't touch CTL, +	 * as some K7 Athlons cause spurious MCEs when its enabled: +	 */  	if (boot_cpu_data.x86 == 6) {  		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);  		i = 1;  	} else  		i = 0; +  	for (; i < nr_mce_banks; i++) {  		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);  		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 00000000000..a3a235a53f0 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,127 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/smp.h> +#include <asm/mce.h> + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ +	struct mce *i = &per_cpu(injectm, m->extcpu); + +	/* Make sure noone reads partially written injectm */ +	i->finished = 0; +	mb(); +	m->finished = 0; +	/* First set the fields after finished */ +	i->extcpu = m->extcpu; +	mb(); +	/* Now write record in order, finished last (except above) */ +	memcpy(i, m, sizeof(struct mce)); +	/* Finally activate it */ +	mb(); +	i->finished = 1; +} + +struct delayed_mce { +	struct timer_list timer; +	struct mce m; +}; + +/* Inject mce on current CPU */ +static void raise_mce(unsigned long data) +{ +	struct delayed_mce *dm = (struct delayed_mce *)data; +	struct mce *m = &dm->m; +	int cpu = m->extcpu; + +	inject_mce(m); +	if (m->status & MCI_STATUS_UC) { +		struct pt_regs regs; +		memset(®s, 0, sizeof(struct pt_regs)); +		regs.ip = m->ip; +		regs.cs = m->cs; +		printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); +		do_machine_check(®s, 0); +		printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); +	} else { +		mce_banks_t b; +		memset(&b, 0xff, sizeof(mce_banks_t)); +		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); +		machine_check_poll(0, &b); +		mce_notify_irq(); +		printk(KERN_INFO "Finished machine check poll on CPU %d\n", +		       cpu); +	} +	kfree(dm); +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, +			 size_t usize, loff_t *off) +{ +	struct delayed_mce *dm; +	struct mce m; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +	/* +	 * There are some cases where real MSR reads could slip +	 * through. +	 */ +	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) +		return -EIO; + +	if ((unsigned long)usize > sizeof(struct mce)) +		usize = sizeof(struct mce); +	if (copy_from_user(&m, ubuf, usize)) +		return -EFAULT; + +	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) +		return -EINVAL; + +	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); +	if (!dm) +		return -ENOMEM; + +	/* +	 * Need to give user space some time to set everything up, +	 * so do it a jiffie or two later everywhere. +	 * Should we use a hrtimer here for better synchronization? 
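A rough user-space sketch of driving the injection interface above. It assumes the exported <asm/mce.h> provides struct mce and the MCI_STATUS_* bits used in this patch, and it submits a corrected (non-UC) record so the kernel takes the machine_check_poll() path rather than raising a full exception; writing to /dev/mcelog requires CAP_SYS_ADMIN.

/*
 * Hypothetical injector: write one fake corrected-error record to
 * /dev/mcelog (mce-inject replaces the device's write handler).
 * Assumes struct mce and MCI_STATUS_* are visible via <asm/mce.h>.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <asm/mce.h>

int main(void)
{
        struct mce m;
        int fd = open("/dev/mcelog", O_RDWR);

        if (fd < 0) {
                perror("/dev/mcelog");
                return 1;
        }
        memset(&m, 0, sizeof(m));
        /* no UC bit, so the kernel polls instead of raising an exception */
        m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV;
        m.bank   = 1;
        m.extcpu = 0;                   /* CPU that should see the event */
        m.addr   = 0x1234000;           /* arbitrary fake address */

        if (write(fd, &m, sizeof(m)) < 0)
                perror("write");
        close(fd);
        return 0;
}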
+	 */ +	memcpy(&dm->m, &m, sizeof(struct mce)); +	setup_timer(&dm->timer, raise_mce, (unsigned long)dm); +	dm->timer.expires = jiffies + 2; +	add_timer_on(&dm->timer, m.extcpu); +	return usize; +} + +static int inject_init(void) +{ +	printk(KERN_INFO "Machine check injector initialized\n"); +	mce_chrdev_ops.write = mce_write; +	return 0; +} + +module_init(inject_init); +/* + * Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 00000000000..54dcb8ff12e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,15 @@ +#include <asm/mce.h> + +enum severity_level { +	MCE_NO_SEVERITY, +	MCE_KEEP_SEVERITY, +	MCE_SOME_SEVERITY, +	MCE_AO_SEVERITY, +	MCE_UC_SEVERITY, +	MCE_AR_SEVERITY, +	MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 00000000000..ff0807f9705 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,218 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + +static struct severity { +	u64 mask; +	u64 result; +	unsigned char sev; +	unsigned char mcgmask; +	unsigned char mcgres; +	unsigned char ser; +	unsigned char context; +	unsigned char covered; +	char *msg; +} severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ +	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) 
\ +	{ .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + +	BITCLR(MCI_STATUS_VAL, NO, "Invalid"), +	BITCLR(MCI_STATUS_EN, NO, "Not enabled"), +	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), +	/* When MCIP is not set something is very confused */ +	MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), +	/* Neither return not error IP -- no chance to recover -> PANIC */ +	MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, +		"Neither restart nor error IP"), +	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", +		KERNEL), +	BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), +	MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, +	     "Spurious not enabled", SER), + +	/* ignore OVER for UCNA */ +	MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, +	     "Uncorrected no action required", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, +	     "Illegal combination (UCNA with AR=1)", SER), +	MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + +	/* AR add known MCACODs here */ +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, +	     "Action required with lost events", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, +	     "Action required; unknown MCACOD", SER), + +	/* known AO MCACODs: */ +	MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, +	     "Action optional: memory scrubbing error", SER), +	MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, +	     "Action optional: last level cache writeback error", SER), + +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, +	     "Action optional unknown MCACOD", SER), +	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, +	     "Action optional with lost events", SER), +	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), +	BITSET(MCI_STATUS_UC, UC, "Uncorrected"), +	BITSET(0, SOME, "No match")	/* always matches. keep at end */ +}; + +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ +	if (m->mcgstatus & MCG_STATUS_EIPV) +		return (m->ip && (m->cs & 3) == 3) ? 
IN_USER : IN_KERNEL; +	/* Unknown, assume kernel */ +	return IN_KERNEL; +} + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ +	enum context ctx = error_context(a); +	struct severity *s; + +	for (s = severities;; s++) { +		if ((a->status & s->mask) != s->result) +			continue; +		if ((a->mcgstatus & s->mcgmask) != s->mcgres) +			continue; +		if (s->ser == SER_REQUIRED && !mce_ser) +			continue; +		if (s->ser == NO_SER && mce_ser) +			continue; +		if (s->context && ctx != s->context) +			continue; +		if (msg) +			*msg = s->msg; +		s->covered = 1; +		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { +			if (panic_on_oops || tolerant < 1) +				return MCE_PANIC_SEVERITY; +		} +		return s->sev; +	} +} + +static void *s_start(struct seq_file *f, loff_t *pos) +{ +	if (*pos >= ARRAY_SIZE(severities)) +		return NULL; +	return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ +	if (++(*pos) >= ARRAY_SIZE(severities)) +		return NULL; +	return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ +	struct severity *ser = data; +	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); +	return 0; +} + +static const struct seq_operations severities_seq_ops = { +	.start	= s_start, +	.next	= s_next, +	.stop	= s_stop, +	.show	= s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, +					 const char __user *ubuf, +					 size_t count, loff_t *ppos) +{ +	int i; +	for (i = 0; i < ARRAY_SIZE(severities); i++) +		severities[i].covered = 0; +	return count; +} + +static const struct file_operations severities_coverage_fops = { +	.open		= severities_coverage_open, +	.release	= seq_release, +	.read		= seq_read, +	.write		= severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ +	struct dentry *dmce = NULL, *fseverities_coverage = NULL; + +	dmce = debugfs_create_dir("mce", NULL); +	if (dmce == NULL) +		goto err_out; +	fseverities_coverage = debugfs_create_file("severities-coverage", +						   0444, dmce, NULL, +						   &severities_coverage_fops); +	if (fseverities_coverage == NULL) +		goto err_out; + +	return 0; + +err_out: +	if (fseverities_coverage) +		debugfs_remove(fseverities_coverage); +	if (dmce) +		debugfs_remove(dmce); +	return -ENOMEM; +} +late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 00000000000..284d1de968b --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,2049 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
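The grading loop in mce_severity() above is easiest to see in isolation. Below is a stripped-down, stand-alone sketch of the same first-match, table-driven idea: each rule is a mask/result pair scanned in order, and the final catch-all rule matches everything. The bit values are invented for the illustration and are not the real MCi_STATUS layout.

/*
 * Stand-alone illustration of first-match, table-driven grading:
 * each rule says "status & mask must equal result".  Bit positions
 * are made up for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define ST_VAL  (1ULL << 63)
#define ST_UC   (1ULL << 61)
#define ST_EN   (1ULL << 60)

enum sev { SEV_NO, SEV_SOME, SEV_UC, SEV_PANIC };

static const struct rule {
        uint64_t mask, result;
        enum sev sev;
        const char *msg;
} rules[] = {
        { ST_VAL,        0,             SEV_NO,   "Invalid"     },
        { ST_EN,         0,             SEV_NO,   "Not enabled" },
        { ST_UC | ST_EN, ST_UC | ST_EN, SEV_UC,   "Uncorrected" },
        { 0,             0,             SEV_SOME, "No match"    }, /* catch-all */
};

static enum sev grade(uint64_t status, const char **msg)
{
        const struct rule *r;

        for (r = rules; ; r++) {        /* last rule always matches */
                if ((status & r->mask) != r->result)
                        continue;
                *msg = r->msg;
                return r->sev;
        }
}

int main(void)
{
        const char *msg;
        enum sev s = grade(ST_VAL | ST_EN | ST_UC, &msg);

        printf("severity %d: %s\n", s, msg);
        return 0;
}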
+ * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ +#include <linux/thread_info.h> +#include <linux/capability.h> +#include <linux/miscdevice.h> +#include <linux/interrupt.h> +#include <linux/ratelimit.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/sysdev.h> +#include <linux/delay.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/processor.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/ipi.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "mce-internal.h" + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ +	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", +	       smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = +						unexpected_machine_check; + +int mce_disabled __read_mostly; + +#ifdef CONFIG_X86_NEW_MCE + +#define MISC_MCELOG_MINOR	227 + +#define SPINUNIT 100	/* 100ns */ + +atomic_t mce_entry; + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +/* + * Tolerant levels: + *   0: always panic on uncorrected errors, log corrected errors + *   1: panic or SIGBUS on uncorrected errors, log corrected errors + *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors + *   3: never panic or SIGBUS, log all errors (for testing only) + */ +static int			tolerant		__read_mostly = 1; +static int			banks			__read_mostly; +static u64			*bank			__read_mostly; +static int			rip_msr			__read_mostly; +static int			mce_bootlog		__read_mostly = -1; +static int			monarch_timeout		__read_mostly = -1; +static int			mce_panic_timeout	__read_mostly; +static int			mce_dont_log_ce		__read_mostly; +int				mce_cmci_disabled	__read_mostly; +int				mce_ignore_ce		__read_mostly; +int				mce_ser			__read_mostly; + +/* User mode helper program triggered by machine check event */ +static unsigned long		mce_need_notify; +static char			mce_helper[128]; +static char			*mce_helper_argv[2] = { mce_helper, NULL }; + +static unsigned long		dont_init_banks; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int			cpu_missing; + + +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { +	[0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +static inline int skip_bank_init(int i) +{ +	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + +static DEFINE_PER_CPU(struct work_struct, mce_work); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ +	memset(m, 0, sizeof(struct mce)); +	m->cpu = m->extcpu = smp_processor_id(); +	rdtscll(m->tsc); +	/* We hope get_seconds stays lockless */ +	m->time = get_seconds(); +	m->cpuvendor = boot_cpu_data.x86_vendor; +	m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP +	m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif +	m->apicid = cpu_data(m->extcpu).initial_apicid; +	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { +	.signature	= MCE_LOG_SIGNATURE, +	.len		= MCE_LOG_LEN, +	.recordlen	= sizeof(struct mce), +}; + +void mce_log(struct mce *mce) +{ +	unsigned next, entry; + +	mce->finished = 0; +	wmb(); +	for (;;) { +		entry = rcu_dereference(mcelog.next); +		for (;;) { +			/* +			 * When the buffer fills up discard new entries. +			 * Assume that the earlier errors are the more +			 * interesting ones: +			 */ +			if (entry >= MCE_LOG_LEN) { +				set_bit(MCE_OVERFLOW, +					(unsigned long *)&mcelog.flags); +				return; +			} +			/* Old left over entry. Skip: */ +			if (mcelog.entry[entry].finished) { +				entry++; +				continue; +			} +			break; +		} +		smp_rmb(); +		next = entry + 1; +		if (cmpxchg(&mcelog.next, entry, next) == entry) +			break; +	} +	memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); +	wmb(); +	mcelog.entry[entry].finished = 1; +	wmb(); + +	mce->finished = 1; +	set_bit(0, &mce_need_notify); +} + +static void print_mce(struct mce *m) +{ +	printk(KERN_EMERG +	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", +	       m->extcpu, m->mcgstatus, m->bank, m->status); +	if (m->ip) { +		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", +		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", +		       m->cs, m->ip); +		if (m->cs == __KERNEL_CS) +			print_symbol("{%s}", m->ip); +		printk("\n"); +	} +	printk(KERN_EMERG "TSC %llx ", m->tsc); +	if (m->addr) +		printk("ADDR %llx ", m->addr); +	if (m->misc) +		printk("MISC %llx ", m->misc); +	printk("\n"); +	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", +			m->cpuvendor, m->cpuid, m->time, m->socketid, +			m->apicid); +} + +static void print_mce_head(void) +{ +	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); +} + +static void print_mce_tail(void) +{ +	printk(KERN_EMERG "This is not a software problem!\n" +	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. 
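The cmpxchg() slot-claiming loop in mce_log() above can be modelled outside the kernel. A simplified user-space sketch follows, with GCC's __sync builtins standing in for cmpxchg() and the barriers; the real code's extra step of skipping over stale finished entries is omitted, and a meaningful test would call log_add() from several threads.

/*
 * Simplified model of the mce_log() slot claim: writers race to bump
 * 'next' with compare-and-swap, then fill their private slot and flip
 * 'finished' last so a reader never sees a half-written record.
 */
#include <stdio.h>

#define LOG_LEN 32

struct rec {
        unsigned long payload;
        volatile int finished;
};

static struct rec log_buf[LOG_LEN];
static volatile unsigned log_next;

static int log_add(unsigned long payload)
{
        unsigned entry, next;

        do {
                entry = log_next;
                if (entry >= LOG_LEN)
                        return -1;      /* buffer full: drop new entries */
                next = entry + 1;
        } while (__sync_val_compare_and_swap(&log_next, entry, next) != entry);

        log_buf[entry].payload = payload;
        __sync_synchronize();           /* order payload before 'finished' */
        log_buf[entry].finished = 1;
        return entry;
}

int main(void)
{
        printf("claimed slot %d\n", log_add(0xdeadbeef));
        return 0;
}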
Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ +	long timeout = PANIC_TIMEOUT*USEC_PER_SEC; +	preempt_disable(); +	local_irq_enable(); +	while (timeout-- > 0) +		udelay(1); +	if (panic_timeout == 0) +		panic_timeout = mce_panic_timeout; +	panic("Panicing machine check CPU died"); +} + +static void mce_panic(char *msg, struct mce *final, char *exp) +{ +	int i; + +	/* +	 * Make sure only one CPU runs in machine check panic +	 */ +	if (atomic_add_return(1, &mce_paniced) > 1) +		wait_for_panic(); +	barrier(); + +	bust_spinlocks(1); +	console_verbose(); +	print_mce_head(); +	/* First print corrected ones that are still unlogged */ +	for (i = 0; i < MCE_LOG_LEN; i++) { +		struct mce *m = &mcelog.entry[i]; +		if (!(m->status & MCI_STATUS_VAL)) +			continue; +		if (!(m->status & MCI_STATUS_UC)) +			print_mce(m); +	} +	/* Now print uncorrected but with the final one last */ +	for (i = 0; i < MCE_LOG_LEN; i++) { +		struct mce *m = &mcelog.entry[i]; +		if (!(m->status & MCI_STATUS_VAL)) +			continue; +		if (!(m->status & MCI_STATUS_UC)) +			continue; +		if (!final || memcmp(m, final, sizeof(struct mce))) +			print_mce(m); +	} +	if (final) +		print_mce(final); +	if (cpu_missing) +		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); +	print_mce_tail(); +	if (exp) +		printk(KERN_EMERG "Machine check: %s\n", exp); +	if (panic_timeout == 0) +		panic_timeout = mce_panic_timeout; +	panic(msg); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ +	unsigned bank = __get_cpu_var(injectm.bank); +	if (msr == rip_msr) +		return offsetof(struct mce, ip); +	if (msr == MSR_IA32_MC0_STATUS + bank*4) +		return offsetof(struct mce, status); +	if (msr == MSR_IA32_MC0_ADDR + bank*4) +		return offsetof(struct mce, addr); +	if (msr == MSR_IA32_MC0_MISC + bank*4) +		return offsetof(struct mce, misc); +	if (msr == MSR_IA32_MCG_STATUS) +		return offsetof(struct mce, mcgstatus); +	return -1; +} + +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ +	u64 v; +	if (__get_cpu_var(injectm).finished) { +		int offset = msr_to_offset(msr); +		if (offset < 0) +			return 0; +		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); +	} +	rdmsrl(msr, v); +	return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ +	if (__get_cpu_var(injectm).finished) { +		int offset = msr_to_offset(msr); +		if (offset >= 0) +			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; +		return; +	} +	wrmsrl(msr, v); +} + +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. 
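A stand-alone version of that single-producer/single-consumer ring, keeping the same convention of leaving one slot unused so that full and empty stay distinguishable. Plain user-space C, for illustration only.

/*
 * SPSC ring in the style of mce_ring: 'end' is only advanced by the
 * producer, 'start' only by the consumer, and one slot stays unused.
 */
#include <stdio.h>

#define RING_SIZE 16    /* usable capacity is RING_SIZE - 1 */

struct ring {
        unsigned short start, end;
        unsigned long slot[RING_SIZE];
};

static int ring_add(struct ring *r, unsigned long v)
{
        unsigned next = (r->end + 1) % RING_SIZE;

        if (next == r->start)
                return -1;              /* full */
        r->slot[r->end] = v;
        __sync_synchronize();           /* publish data before moving 'end' */
        r->end = next;
        return 0;
}

static int ring_get(struct ring *r, unsigned long *v)
{
        if (r->start == r->end)
                return 0;               /* empty */
        *v = r->slot[r->start];
        r->start = (r->start + 1) % RING_SIZE;
        return 1;
}

int main(void)
{
        struct ring r = { 0, 0, { 0 } };
        unsigned long v;

        ring_add(&r, 0x1000);           /* e.g. a page frame number */
        while (ring_get(&r, &v))
                printf("got %#lx\n", v);
        return 0;
}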
+ */ +#define MCE_RING_SIZE 16	/* we use one entry less */ + +struct mce_ring { +	unsigned short start; +	unsigned short end; +	unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ +	struct mce_ring *r = &__get_cpu_var(mce_ring); + +	return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ +	struct mce_ring *r; +	int ret = 0; + +	*pfn = 0; +	get_cpu(); +	r = &__get_cpu_var(mce_ring); +	if (r->start == r->end) +		goto out; +	*pfn = r->ring[r->start]; +	r->start = (r->start + 1) % MCE_RING_SIZE; +	ret = 1; +out: +	put_cpu(); +	return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ +	struct mce_ring *r = &__get_cpu_var(mce_ring); +	unsigned next; + +	next = (r->end + 1) % MCE_RING_SIZE; +	if (next == r->start) +		return -1; +	r->ring[r->end] = pfn; +	wmb(); +	r->end = next; +	return 0; +} + +int mce_available(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return 0; +	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ +	if (!mce_ring_empty()) { +		struct work_struct *work = &__get_cpu_var(mce_work); +		if (!work_pending(work)) +			schedule_work(work); +	} +} + +/* + * Get the address of the instruction at the time of the machine check + * error. + */ +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + +	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { +		m->ip = regs->ip; +		m->cs = regs->cs; +	} else { +		m->ip = 0; +		m->cs = 0; +	} +	if (rip_msr) +		m->ip = mce_rdmsrl(rip_msr); +} + +#ifdef CONFIG_X86_LOCAL_APIC  +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ +	ack_APIC_irq(); +	exit_idle(); +	irq_enter(); +	mce_notify_irq(); +	mce_schedule_work(); +	irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ +	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { +		mce_notify_irq(); +		/* +		 * Triggering the work queue here is just an insurance +		 * policy in case the syscall exit notify handler +		 * doesn't run soon enough or ends up running on the +		 * wrong CPU (can happen when audit sleeps) +		 */ +		mce_schedule_work(); +		return; +	} + +#ifdef CONFIG_X86_LOCAL_APIC +	/* +	 * Without APIC do not notify. The event will be picked +	 * up eventually. +	 */ +	if (!cpu_has_apic) +		return; + +	/* +	 * When interrupts are disabled we cannot use +	 * kernel services safely. Trigger an self interrupt +	 * through the APIC to instead do the notification +	 * after interrupts are reenabled again. +	 */ +	apic->send_IPI_self(MCE_SELF_VECTOR); + +	/* +	 * Wait for idle afterwards again so that we don't leave the +	 * APIC in a non idle state because the normal APIC writes +	 * cannot exclude us. +	 */ +	apic_wait_icr_idle(); +#endif +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. 
However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ +	struct mce m; +	int i; + +	__get_cpu_var(mce_poll_count)++; + +	mce_setup(&m); + +	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); +	for (i = 0; i < banks; i++) { +		if (!bank[i] || !test_bit(i, *b)) +			continue; + +		m.misc = 0; +		m.addr = 0; +		m.bank = i; +		m.tsc = 0; + +		barrier(); +		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if (!(m.status & MCI_STATUS_VAL)) +			continue; + +		/* +		 * Uncorrected or signalled events are handled by the exception +		 * handler when it is enabled, so don't process those here. +		 * +		 * TBD do the same check for MCI_STATUS_EN here? +		 */ +		if (!(flags & MCP_UC) && +		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) +			continue; + +		if (m.status & MCI_STATUS_MISCV) +			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +		if (m.status & MCI_STATUS_ADDRV) +			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + +		if (!(flags & MCP_TIMESTAMP)) +			m.tsc = 0; +		/* +		 * Don't get the IP here because it's unlikely to +		 * have anything to do with the actual error location. +		 */ +		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { +			mce_log(&m); +			add_taint(TAINT_MACHINE_CHECK); +		} + +		/* +		 * Clear state for this bank. +		 */ +		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} + +	/* +	 * Don't clear MCG_STATUS here because it's only defined for +	 * exceptions. +	 */ + +	sync_core(); +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) +			return 1; +	} +	return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ +	/* +	 * The others already did panic for some reason. +	 * Bail out like in a timeout. +	 * rmb() to tell the compiler that system_state +	 * might have been modified by someone else. +	 */ +	rmb(); +	if (atomic_read(&mce_paniced)) +		wait_for_panic(); +	if (!monarch_timeout) +		goto out; +	if ((s64)*t < SPINUNIT) { +		/* CHECKME: Make panic default for 1 too? */ +		if (tolerant < 1) +			mce_panic("Timeout synchronizing machine check over CPUs", +				  NULL, NULL); +		cpu_missing = 1; +		return 1; +	} +	*t -= SPINUNIT; +out: +	touch_nmi_watchdog(); +	return 0; +} + +/* + * The Monarch's reign.  The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. 
They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ +	int cpu; +	struct mce *m = NULL; +	int global_worst = 0; +	char *msg = NULL; +	char *nmsg = NULL; + +	/* +	 * This CPU is the Monarch and the other CPUs have run +	 * through their handlers. +	 * Grade the severity of the errors of all the CPUs. +	 */ +	for_each_possible_cpu(cpu) { +		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, +					    &nmsg); +		if (severity > global_worst) { +			msg = nmsg; +			global_worst = severity; +			m = &per_cpu(mces_seen, cpu); +		} +	} + +	/* +	 * Cannot recover? Panic here then. +	 * This dumps all the mces in the log buffer and stops the +	 * other CPUs. +	 */ +	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) +		mce_panic("Fatal Machine check", m, msg); + +	/* +	 * For UC somewhere we let the CPU who detects it handle it. +	 * Also must let continue the others, otherwise the handling +	 * CPU could deadlock on a lock. +	 */ + +	/* +	 * No machine check event found. Must be some external +	 * source or one CPU is hung. Panic. +	 */ +	if (!m && tolerant < 3) +		mce_panic("Machine check from unknown source", NULL, NULL); + +	/* +	 * Now clear all the mces_seen so that they don't reappear on +	 * the next mce. +	 */ +	for_each_possible_cpu(cpu) +		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int *no_way_out) +{ +	int order; +	int cpus = num_online_cpus(); +	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + +	if (!timeout) +		return -1; + +	atomic_add(*no_way_out, &global_nwo); +	/* +	 * global_nwo should be updated before mce_callin +	 */ +	smp_wmb(); +	order = atomic_add_return(1, &mce_callin); + +	/* +	 * Wait for everyone. +	 */ +	while (atomic_read(&mce_callin) != cpus) { +		if (mce_timed_out(&timeout)) { +			atomic_set(&global_nwo, 0); +			return -1; +		} +		ndelay(SPINUNIT); +	} + +	/* +	 * mce_callin should be read before global_nwo +	 */ +	smp_rmb(); + +	if (order == 1) { +		/* +		 * Monarch: Starts executing now, the others wait. +		 */ +		atomic_set(&mce_executing, 1); +	} else { +		/* +		 * Subject: Now start the scanning loop one by one in +		 * the original callin order. +		 * This way when there are any shared banks it will be +		 * only seen by one CPU before cleared, avoiding duplicates. +		 */ +		while (atomic_read(&mce_executing) < order) { +			if (mce_timed_out(&timeout)) { +				atomic_set(&global_nwo, 0); +				return -1; +			} +			ndelay(SPINUNIT); +		} +	} + +	/* +	 * Cache the global no_way_out state. 
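The callin/executing handshake in mce_start() above (and mce_end() further below) boils down to two shared counters. Here is a user-space pthread sketch of that ordering: the first arrival becomes the Monarch and the rest run their scan one at a time in callin order. Timeouts and the no_way_out bookkeeping are left out, threads stand in for CPUs, and the busy-waits are deliberately crude; build with -pthread.

/* Minimal model of the Monarch rendezvous. */
#include <stdio.h>
#include <pthread.h>

#define NCPUS 4

static volatile int callin, executing;

static void *mce_entry_point(void *arg)
{
        int order = __sync_add_and_fetch(&callin, 1);

        while (callin != NCPUS)
                ;                       /* wait until everyone arrived */
        if (order == 1)
                executing = 1;          /* Monarch starts immediately */
        else
                while (executing < order)
                        ;               /* Subjects wait for their turn */

        printf("thread %ld scans banks as #%d\n", (long)arg, order);

        __sync_add_and_fetch(&executing, 1);    /* let the next one run */
        return NULL;
}

int main(void)
{
        pthread_t t[NCPUS];
        long i;

        for (i = 0; i < NCPUS; i++)
                pthread_create(&t[i], NULL, mce_entry_point, (void *)i);
        for (i = 0; i < NCPUS; i++)
                pthread_join(t[i], NULL);
        return 0;
}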
+	 */ +	*no_way_out = atomic_read(&global_nwo); + +	return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ +	int ret = -1; +	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + +	if (!timeout) +		goto reset; +	if (order < 0) +		goto reset; + +	/* +	 * Allow others to run. +	 */ +	atomic_inc(&mce_executing); + +	if (order == 1) { +		/* CHECKME: Can this race with a parallel hotplug? */ +		int cpus = num_online_cpus(); + +		/* +		 * Monarch: Wait for everyone to go through their scanning +		 * loops. +		 */ +		while (atomic_read(&mce_executing) <= cpus) { +			if (mce_timed_out(&timeout)) +				goto reset; +			ndelay(SPINUNIT); +		} + +		mce_reign(); +		barrier(); +		ret = 0; +	} else { +		/* +		 * Subject: Wait for Monarch to finish. +		 */ +		while (atomic_read(&mce_executing) != 0) { +			if (mce_timed_out(&timeout)) +				goto reset; +			ndelay(SPINUNIT); +		} + +		/* +		 * Don't reset anything. That's done by the Monarch. +		 */ +		return 0; +	} + +	/* +	 * Reset all global state. +	 */ +reset: +	atomic_set(&global_nwo, 0); +	atomic_set(&mce_callin, 0); +	barrier(); + +	/* +	 * Let others run again. +	 */ +	atomic_set(&mce_executing, 0); +	return ret; +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ +	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) +		return 0; +	if ((m->misc & 0x3f) > PAGE_SHIFT) +		return 0; +	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) +		return 0; +	return 1; +} + +static void mce_clear_state(unsigned long *toclear) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		if (test_bit(i, toclear)) +			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ +	struct mce m, *final; +	int i; +	int worst = 0; +	int severity; +	/* +	 * Establish sequential order between the CPUs entering the machine +	 * check handler. +	 */ +	int order; +	/* +	 * If no_way_out gets set, there is no safe way to recover from this +	 * MCE.  If tolerant is cranked up, we'll try anyway. +	 */ +	int no_way_out = 0; +	/* +	 * If kill_it gets set, there might be a way to recover from this +	 * error. 
+	 */ +	int kill_it = 0; +	DECLARE_BITMAP(toclear, MAX_NR_BANKS); +	char *msg = "Unknown"; + +	atomic_inc(&mce_entry); + +	__get_cpu_var(mce_exception_count)++; + +	if (notify_die(DIE_NMI, "machine check", regs, error_code, +			   18, SIGKILL) == NOTIFY_STOP) +		goto out; +	if (!banks) +		goto out; + +	mce_setup(&m); + +	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); +	no_way_out = mce_no_way_out(&m, &msg); + +	final = &__get_cpu_var(mces_seen); +	*final = m; + +	barrier(); + +	/* +	 * When no restart IP must always kill or panic. +	 */ +	if (!(m.mcgstatus & MCG_STATUS_RIPV)) +		kill_it = 1; + +	/* +	 * Go through all the banks in exclusion of the other CPUs. +	 * This way we don't report duplicated events on shared banks +	 * because the first one to see it will clear it. +	 */ +	order = mce_start(&no_way_out); +	for (i = 0; i < banks; i++) { +		__clear_bit(i, toclear); +		if (!bank[i]) +			continue; + +		m.misc = 0; +		m.addr = 0; +		m.bank = i; + +		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		if ((m.status & MCI_STATUS_VAL) == 0) +			continue; + +		/* +		 * Non uncorrected or non signaled errors are handled by +		 * machine_check_poll. Leave them alone, unless this panics. +		 */ +		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && +			!no_way_out) +			continue; + +		/* +		 * Set taint even when machine check was not enabled. +		 */ +		add_taint(TAINT_MACHINE_CHECK); + +		severity = mce_severity(&m, tolerant, NULL); + +		/* +		 * When machine check was for corrected handler don't touch, +		 * unless we're panicing. +		 */ +		if (severity == MCE_KEEP_SEVERITY && !no_way_out) +			continue; +		__set_bit(i, toclear); +		if (severity == MCE_NO_SEVERITY) { +			/* +			 * Machine check event was not enabled. Clear, but +			 * ignore. +			 */ +			continue; +		} + +		/* +		 * Kill on action required. +		 */ +		if (severity == MCE_AR_SEVERITY) +			kill_it = 1; + +		if (m.status & MCI_STATUS_MISCV) +			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +		if (m.status & MCI_STATUS_ADDRV) +			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + +		/* +		 * Action optional error. Queue address for later processing. +		 * When the ring overflows we just ignore the AO error. +		 * RED-PEN add some logging mechanism when +		 * usable_address or mce_add_ring fails. +		 * RED-PEN don't ignore overflow for tolerant == 0 +		 */ +		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) +			mce_ring_add(m.addr >> PAGE_SHIFT); + +		mce_get_rip(&m, regs); +		mce_log(&m); + +		if (severity > worst) { +			*final = m; +			worst = severity; +		} +	} + +	if (!no_way_out) +		mce_clear_state(toclear); + +	/* +	 * Do most of the synchronization with other CPUs. +	 * When there's any problem use only local no_way_out state. +	 */ +	if (mce_end(order) < 0) +		no_way_out = worst >= MCE_PANIC_SEVERITY; + +	/* +	 * If we have decided that we just CAN'T continue, and the user +	 * has not set tolerant to an insane level, give up and die. +	 * +	 * This is mainly used in the case when the system doesn't +	 * support MCE broadcasting or it has been disabled. +	 */ +	if (no_way_out && tolerant < 3) +		mce_panic("Fatal machine check on current CPU", final, msg); + +	/* +	 * If the error seems to be unrecoverable, something should be +	 * done.  Try to kill as little as possible.  If we can kill just +	 * one task, do that.  If the user has set the tolerance very +	 * high, don't try to do anything at all. 
+	 */ + +	if (kill_it && tolerant < 3) +		force_sig(SIGBUS, current); + +	/* notify userspace ASAP */ +	set_thread_flag(TIF_MCE_NOTIFY); + +	if (worst > 0) +		mce_report_event(regs); +	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); +out: +	atomic_dec(&mce_entry); +	sync_core(); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +/* dummy to break dependency. actual code is in mm/memory-failure.c */ +void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +{ +	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +} + +/* + * Called after mce notification in process context. This code + * is allowed to sleep. Call the high level VM handler to process + * any corrupted pages. + * Assume that the work queue code only calls this one at a time + * per CPU. + * Note we don't disable preemption, so this code might run on the wrong + * CPU. In this case the event is picked up by the scheduled work queue. + * This is merely a fast path to expedite processing in some common + * cases. + */ +void mce_notify_process(void) +{ +	unsigned long pfn; +	mce_notify_irq(); +	while (mce_ring_get(&pfn)) +		memory_failure(pfn, MCE_VECTOR); +} + +static void mce_process_work(struct work_struct *dummy) +{ +	mce_notify_process(); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occurred. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(__u64 status) +{ +	struct mce m; + +	mce_setup(&m); +	m.bank = MCE_THERMAL_BANK; +	m.status = status; +	mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors.  If the + * poller finds an MCE, poll 2x faster.  When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static int check_interval = 5 * 60; /* 5 minutes */ + +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static void mcheck_timer(unsigned long data) +{ +	struct timer_list *t = &per_cpu(mce_timer, data); +	int *n; + +	WARN_ON(smp_processor_id() != data); + +	if (mce_available(¤t_cpu_data)) { +		machine_check_poll(MCP_TIMESTAMP, +				&__get_cpu_var(mce_poll_banks)); +	} + +	/* +	 * Alert userspace if needed.  If we logged an MCE, reduce the +	 * polling interval, otherwise increase the polling interval. +	 */ +	n = &__get_cpu_var(next_interval); +	if (mce_notify_irq()) +		*n = max(*n/2, HZ/100); +	else +		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + +	t->expires = jiffies + *n; +	add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ +	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. 
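The adaptive polling policy in mcheck_timer() above (halve the interval when an event was logged, otherwise back off up to check_interval) is small enough to show on its own. The constants below are stand-ins for the kernel's HZ and limits.

/* Toy model of the mcheck_timer() interval adaptation. */
#include <stdio.h>

#define HZ              1000
#define CHECK_INTERVAL  (5 * 60)        /* seconds, as in mce.c */

static int next_interval = CHECK_INTERVAL * HZ; /* in "jiffies" */

static void adapt(int logged_event)
{
        if (logged_event)
                next_interval = next_interval / 2 > HZ / 100 ?
                                next_interval / 2 : HZ / 100;
        else
                next_interval = next_interval * 2 < CHECK_INTERVAL * HZ ?
                                next_interval * 2 : CHECK_INTERVAL * HZ;
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++) {       /* a burst of events... */
                adapt(1);
                printf("after event:     poll in %d jiffies\n", next_interval);
        }
        for (i = 0; i < 4; i++) {       /* ...then quiet again */
                adapt(0);
                printf("after quiet run: poll in %d jiffies\n", next_interval);
        }
        return 0;
}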
+ */ +int mce_notify_irq(void) +{ +	/* Not more than two messages every minute */ +	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + +	clear_thread_flag(TIF_MCE_NOTIFY); + +	if (test_and_clear_bit(0, &mce_need_notify)) { +		wake_up_interruptible(&mce_wait); + +		/* +		 * There is no risk of missing notifications because +		 * work_pending is always cleared before the function is +		 * executed. +		 */ +		if (mce_helper[0] && !work_pending(&mce_trigger_work)) +			schedule_work(&mce_trigger_work); + +		if (__ratelimit(&ratelimit)) +			printk(KERN_INFO "Machine check events logged\n"); + +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +/* + * Initialize Machine Checks for a CPU. + */ +static int mce_cap_init(void) +{ +	unsigned b; +	u64 cap; + +	rdmsrl(MSR_IA32_MCG_CAP, cap); + +	b = cap & MCG_BANKCNT_MASK; +	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + +	if (b > MAX_NR_BANKS) { +		printk(KERN_WARNING +		       "MCE: Using only %u machine check banks out of %u\n", +			MAX_NR_BANKS, b); +		b = MAX_NR_BANKS; +	} + +	/* Don't support asymmetric configurations today */ +	WARN_ON(banks != 0 && b != banks); +	banks = b; +	if (!bank) { +		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); +		if (!bank) +			return -ENOMEM; +		memset(bank, 0xff, banks * sizeof(u64)); +	} + +	/* Use accurate RIP reporting if available. */ +	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) +		rip_msr = MSR_IA32_MCG_EIP; + +	if (cap & MCG_SER_P) +		mce_ser = 1; + +	return 0; +} + +static void mce_init(void) +{ +	mce_banks_t all_banks; +	u64 cap; +	int i; + +	/* +	 * Log the machine checks left over from the previous reset. +	 */ +	bitmap_fill(all_banks, MAX_NR_BANKS); +	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + +	set_in_cr4(X86_CR4_MCE); + +	rdmsrl(MSR_IA32_MCG_CAP, cap); +	if (cap & MCG_CTL_P) +		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + +	for (i = 0; i < banks; i++) { +		if (skip_bank_init(i)) +			continue; +		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); +		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} +} + +/* Add per CPU specific workarounds here */ +static void mce_cpu_quirks(struct cpuinfo_x86 *c) +{ +	/* This should be disabled by the BIOS, but isn't always */ +	if (c->x86_vendor == X86_VENDOR_AMD) { +		if (c->x86 == 15 && banks > 4) { +			/* +			 * disable GART TBL walk error reporting, which +			 * trips off incorrectly with the IOMMU & 3ware +			 * & Cerberus: +			 */ +			clear_bit(10, (unsigned long *)&bank[4]); +		} +		if (c->x86 <= 17 && mce_bootlog < 0) { +			/* +			 * Lots of broken BIOS around that don't clear them +			 * by default and leave crap in there. Don't log: +			 */ +			mce_bootlog = 0; +		} +		/* +		 * Various K7s with broken bank 0 around. Always disable +		 * by default. +		 */ +		 if (c->x86 == 6 && banks > 0) +			bank[0] = 0; +	} + +	if (c->x86_vendor == X86_VENDOR_INTEL) { +		/* +		 * SDM documents that on family 6 bank 0 should not be written +		 * because it aliases to another special BIOS controlled +		 * register. +		 * But it's not aliased anymore on model 0x1a+ +		 * Don't ignore bank 0 completely because there could be a +		 * valid event later, merely don't write CTL0. +		 */ + +		if (c->x86 == 6 && c->x86_model < 0x1A) +			__set_bit(0, &dont_init_banks); + +		/* +		 * All newer Intel systems support MCE broadcasting. Enable +		 * synchronization with a one second timeout. 
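mce_cap_init() above consumes only a few MCG_CAP fields. The stand-alone decoder below walks the same layout (bank count in bits 7:0, MCG_CTL_P, MCG_EXT_P with MCG_EXT_CNT, MCG_SER_P); the masks are spelled out here rather than taken from <asm/mce.h>, and the sample value is invented.

/*
 * Decode an IA32_MCG_CAP value the way mce_cap_init() does.  Bit
 * positions follow the SDM layout; double-check them against the
 * header before relying on this.
 */
#include <stdio.h>
#include <stdint.h>

#define MCG_BANKCNT_MASK        0xff
#define MCG_CTL_P               (1ULL << 8)
#define MCG_EXT_P               (1ULL << 9)
#define MCG_EXT_CNT(c)          (((c) >> 16) & 0xff)
#define MCG_SER_P               (1ULL << 24)

static void decode_mcg_cap(uint64_t cap)
{
        printf("banks:             %llu\n",
               (unsigned long long)(cap & MCG_BANKCNT_MASK));
        printf("MCG_CTL present:   %s\n", cap & MCG_CTL_P ? "yes" : "no");
        if (cap & MCG_EXT_P)
                printf("extended regs:     %llu (RIP MSR usable if >= 9)\n",
                       (unsigned long long)MCG_EXT_CNT(cap));
        printf("software recovery: %s\n", cap & MCG_SER_P ? "yes" : "no");
}

int main(void)
{
        decode_mcg_cap(0x1090306ULL);   /* invented example value */
        return 0;
}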
+		 */ +		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && +			monarch_timeout < 0) +			monarch_timeout = USEC_PER_SEC; +	} +	if (monarch_timeout < 0) +		monarch_timeout = 0; +	if (mce_bootlog != 0) +		mce_panic_timeout = 30; +} + +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ +	if (c->x86 != 5) +		return; +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		intel_p5_mcheck_init(c); +		break; +	case X86_VENDOR_CENTAUR: +		winchip_mcheck_init(c); +		break; +	} +} + +static void mce_cpu_features(struct cpuinfo_x86 *c) +{ +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		mce_intel_feature_init(c); +		break; +	case X86_VENDOR_AMD: +		mce_amd_feature_init(c); +		break; +	default: +		break; +	} +} + +static void mce_init_timer(void) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); +	int *n = &__get_cpu_var(next_interval); + +	if (mce_ignore_ce) +		return; + +	*n = check_interval * HZ; +	if (!*n) +		return; +	setup_timer(t, mcheck_timer, smp_processor_id()); +	t->expires = round_jiffies(jiffies + *n); +	add_timer(t); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return; + +	mce_ancient_init(c); + +	if (!mce_available(c)) +		return; + +	if (mce_cap_init() < 0) { +		mce_disabled = 1; +		return; +	} +	mce_cpu_quirks(c); + +	machine_check_vector = do_machine_check; + +	mce_init(); +	mce_cpu_features(c); +	mce_init_timer(); +	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int		open_count;		/* #times opened */ +static int		open_exclu;		/* already open exclusive? 
*/ + +static int mce_open(struct inode *inode, struct file *file) +{ +	spin_lock(&mce_state_lock); + +	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { +		spin_unlock(&mce_state_lock); + +		return -EBUSY; +	} + +	if (file->f_flags & O_EXCL) +		open_exclu = 1; +	open_count++; + +	spin_unlock(&mce_state_lock); + +	return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ +	spin_lock(&mce_state_lock); + +	open_count--; +	open_exclu = 0; + +	spin_unlock(&mce_state_lock); + +	return 0; +} + +static void collect_tscs(void *data) +{ +	unsigned long *cpu_tsc = (unsigned long *)data; + +	rdtscll(cpu_tsc[smp_processor_id()]); +} + +static DEFINE_MUTEX(mce_read_mutex); + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, +			loff_t *off) +{ +	char __user *buf = ubuf; +	unsigned long *cpu_tsc; +	unsigned prev, next; +	int i, err; + +	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); +	if (!cpu_tsc) +		return -ENOMEM; + +	mutex_lock(&mce_read_mutex); +	next = rcu_dereference(mcelog.next); + +	/* Only supports full reads right now */ +	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { +		mutex_unlock(&mce_read_mutex); +		kfree(cpu_tsc); + +		return -EINVAL; +	} + +	err = 0; +	prev = 0; +	do { +		for (i = prev; i < next; i++) { +			unsigned long start = jiffies; + +			while (!mcelog.entry[i].finished) { +				if (time_after_eq(jiffies, start + 2)) { +					memset(mcelog.entry + i, 0, +					       sizeof(struct mce)); +					goto timeout; +				} +				cpu_relax(); +			} +			smp_rmb(); +			err |= copy_to_user(buf, mcelog.entry + i, +					    sizeof(struct mce)); +			buf += sizeof(struct mce); +timeout: +			; +		} + +		memset(mcelog.entry + prev, 0, +		       (next - prev) * sizeof(struct mce)); +		prev = next; +		next = cmpxchg(&mcelog.next, prev, 0); +	} while (next != prev); + +	synchronize_sched(); + +	/* +	 * Collect entries that were still getting written before the +	 * synchronize. +	 */ +	on_each_cpu(collect_tscs, cpu_tsc, 1); + +	for (i = next; i < MCE_LOG_LEN; i++) { +		if (mcelog.entry[i].finished && +		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { +			err |= copy_to_user(buf, mcelog.entry+i, +					    sizeof(struct mce)); +			smp_rmb(); +			buf += sizeof(struct mce); +			memset(&mcelog.entry[i], 0, sizeof(struct mce)); +		} +	} +	mutex_unlock(&mce_read_mutex); +	kfree(cpu_tsc); + +	return err ? 
-EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ +	poll_wait(file, &mce_wait, wait); +	if (rcu_dereference(mcelog.next)) +		return POLLIN | POLLRDNORM; +	return 0; +} + +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ +	int __user *p = (int __user *)arg; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	switch (cmd) { +	case MCE_GET_RECORD_LEN: +		return put_user(sizeof(struct mce), p); +	case MCE_GET_LOG_LEN: +		return put_user(MCE_LOG_LEN, p); +	case MCE_GETCLEAR_FLAGS: { +		unsigned flags; + +		do { +			flags = mcelog.flags; +		} while (cmpxchg(&mcelog.flags, flags, 0) != flags); + +		return put_user(flags, p); +	} +	default: +		return -ENOTTY; +	} +} + +/* Modified in mce-inject.c, so not static or const */ +struct file_operations mce_chrdev_ops = { +	.open			= mce_open, +	.release		= mce_release, +	.read			= mce_read, +	.poll			= mce_poll, +	.unlocked_ioctl		= mce_ioctl, +}; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); + +static struct miscdevice mce_log_device = { +	MISC_MCELOG_MINOR, +	"mcelog", +	&mce_chrdev_ops, +}; + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + *	monarchtimeout is how long to wait for other CPUs on machine + *	check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ +static int __init mcheck_enable(char *str) +{ +	if (*str == 0) +		enable_p5_mce(); +	if (*str == '=') +		str++; +	if (!strcmp(str, "off")) +		mce_disabled = 1; +	else if (!strcmp(str, "no_cmci")) +		mce_cmci_disabled = 1; +	else if (!strcmp(str, "dont_log_ce")) +		mce_dont_log_ce = 1; +	else if (!strcmp(str, "ignore_ce")) +		mce_ignore_ce = 1; +	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) +		mce_bootlog = (str[0] == 'b'); +	else if (isdigit(str[0])) { +		get_option(&str, &tolerant); +		if (*str == ',') { +			++str; +			get_option(&str, &monarch_timeout); +		} +	} else { +		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", +		       str); +		return 0; +	} +	return 1; +} +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ +	int i; + +	for (i = 0; i < banks; i++) { +		if (!skip_bank_init(i)) +			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +	} +	return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ +	return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ +	return mce_disable(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
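Because mce_read() above only accepts full-buffer reads, a consumer has to size its buffer from the two length ioctls first. A minimal user-space reader along those lines follows; it assumes the MCE_GET_RECORD_LEN/MCE_GET_LOG_LEN ioctls and struct mce come from the exported <asm/mce.h>, and it needs root.

/*
 * Minimal /dev/mcelog reader: query record and log length, then issue
 * one full-sized read as mce_read() requires.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>            /* MCE_GET_*, struct mce (assumed exported) */

int main(void)
{
        int fd = open("/dev/mcelog", O_RDONLY);
        int reclen, loglen, i;
        ssize_t n;
        char *buf;

        if (fd < 0 || ioctl(fd, MCE_GET_RECORD_LEN, &reclen) < 0 ||
            ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0) {
                perror("mcelog");
                return 1;
        }
        buf = calloc(loglen, reclen);
        if (!buf)
                return 1;

        n = read(fd, buf, (size_t)loglen * reclen);     /* full reads only */
        for (i = 0; i * reclen < n; i++) {
                struct mce *m = (struct mce *)(buf + i * reclen);

                printf("cpu %u bank %d status %#llx\n",
                       m->extcpu, m->bank, (unsigned long long)m->status);
        }
        free(buf);
        close(fd);
        return 0;
}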
+ * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static int mce_resume(struct sys_device *dev) +{ +	mce_init(); +	mce_cpu_features(¤t_cpu_data); + +	return 0; +} + +static void mce_cpu_restart(void *data) +{ +	del_timer_sync(&__get_cpu_var(mce_timer)); +	if (!mce_available(¤t_cpu_data)) +		return; +	mce_init(); +	mce_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ +	on_each_cpu(mce_cpu_restart, NULL, 1); +} + +/* Toggle features for corrected errors */ +static void mce_disable_ce(void *all) +{ +	if (!mce_available(¤t_cpu_data)) +		return; +	if (all) +		del_timer_sync(&__get_cpu_var(mce_timer)); +	cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ +	if (!mce_available(¤t_cpu_data)) +		return; +	cmci_reenable(); +	cmci_recheck(); +	if (all) +		mce_init_timer(); +} + +static struct sysdev_class mce_sysclass = { +	.suspend	= mce_suspend, +	.shutdown	= mce_shutdown, +	.resume		= mce_resume, +	.name		= "machinecheck", +}; + +DEFINE_PER_CPU(struct sys_device, mce_dev); + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +			 char *buf) +{ +	u64 b = bank[attr - bank_attrs]; + +	return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +			const char *buf, size_t size) +{ +	u64 new; + +	if (strict_strtoull(buf, 0, &new) < 0) +		return -EINVAL; + +	bank[attr - bank_attrs] = new; +	mce_restart(); + +	return size; +} + +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +{ +	strcpy(buf, mce_helper); +	strcat(buf, "\n"); +	return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +				const char *buf, size_t siz) +{ +	char *p; +	int len; + +	strncpy(mce_helper, buf, sizeof(mce_helper)); +	mce_helper[sizeof(mce_helper)-1] = 0; +	len = strlen(mce_helper); +	p = strchr(mce_helper, '\n'); + +	if (*p) +		*p = 0; + +	return len; +} + +static ssize_t set_ignore_ce(struct sys_device *s, +			     struct sysdev_attribute *attr, +			     const char *buf, size_t size) +{ +	u64 new; + +	if (strict_strtoull(buf, 0, &new) < 0) +		return -EINVAL; + +	if (mce_ignore_ce ^ !!new) { +		if (new) { +			/* disable ce features */ +			on_each_cpu(mce_disable_ce, (void *)1, 1); +			mce_ignore_ce = 1; +		} else { +			/* enable ce features */ +			mce_ignore_ce = 0; +			on_each_cpu(mce_enable_ce, (void *)1, 1); +		} +	} +	return size; +} + +static ssize_t set_cmci_disabled(struct sys_device *s, +				 struct sysdev_attribute *attr, +				 const char *buf, size_t size) +{ +	u64 new; + +	if (strict_strtoull(buf, 0, &new) < 0) +		return -EINVAL; + +	if (mce_cmci_disabled ^ !!new) { +		if (new) { +			/* disable cmci */ +			on_each_cpu(mce_disable_ce, NULL, 1); +			mce_cmci_disabled = 1; +		} else { +			/* enable cmci */ +			mce_cmci_disabled = 0; +			on_each_cpu(mce_enable_ce, NULL, 1); +		} +	} +	return size; +} + +static ssize_t store_int_with_restart(struct sys_device *s, +				      struct sysdev_attribute *attr, +				      const char *buf, size_t size) +{ +	ssize_t ret = sysdev_store_int(s, attr, buf, size); +	mce_restart(); +	return ret; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(monarch_timeout, 
0644, monarch_timeout); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); + +static struct sysdev_ext_attribute attr_check_interval = { +	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, +		     store_int_with_restart), +	&check_interval +}; + +static struct sysdev_ext_attribute attr_ignore_ce = { +	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +	&mce_ignore_ce +}; + +static struct sysdev_ext_attribute attr_cmci_disabled = { +	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +	&mce_cmci_disabled +}; + +static struct sysdev_attribute *mce_attrs[] = { +	&attr_tolerant.attr, +	&attr_check_interval.attr, +	&attr_trigger, +	&attr_monarch_timeout.attr, +	&attr_dont_log_ce.attr, +	&attr_ignore_ce.attr, +	&attr_cmci_disabled.attr, +	NULL +}; + +static cpumask_var_t mce_dev_initialized; + +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ +	int err; +	int i, j; + +	if (!mce_available(&boot_cpu_data)) +		return -EIO; + +	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); +	per_cpu(mce_dev, cpu).id	= cpu; +	per_cpu(mce_dev, cpu).cls	= &mce_sysclass; + +	err = sysdev_register(&per_cpu(mce_dev, cpu)); +	if (err) +		return err; + +	for (i = 0; mce_attrs[i]; i++) { +		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); +		if (err) +			goto error; +	} +	for (j = 0; j < banks; j++) { +		err = sysdev_create_file(&per_cpu(mce_dev, cpu), +					&bank_attrs[j]); +		if (err) +			goto error2; +	} +	cpumask_set_cpu(cpu, mce_dev_initialized); + +	return 0; +error2: +	while (--j >= 0) +		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); +error: +	while (--i >= 0) +		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + +	sysdev_unregister(&per_cpu(mce_dev, cpu)); + +	return err; +} + +static __cpuinit void mce_remove_device(unsigned int cpu) +{ +	int i; + +	if (!cpumask_test_cpu(cpu, mce_dev_initialized)) +		return; + +	for (i = 0; mce_attrs[i]; i++) +		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + +	for (i = 0; i < banks; i++) +		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + +	sysdev_unregister(&per_cpu(mce_dev, cpu)); +	cpumask_clear_cpu(cpu, mce_dev_initialized); +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ +	unsigned long action = *(unsigned long *)h; +	int i; + +	if (!mce_available(¤t_cpu_data)) +		return; +	if (!(action & CPU_TASKS_FROZEN)) +		cmci_clear(); +	for (i = 0; i < banks; i++) { +		if (!skip_bank_init(i)) +			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +	} +} + +static void mce_reenable_cpu(void *h) +{ +	unsigned long action = *(unsigned long *)h; +	int i; + +	if (!mce_available(¤t_cpu_data)) +		return; + +	if (!(action & CPU_TASKS_FROZEN)) +		cmci_reenable(); +	for (i = 0; i < banks; i++) { +		if (!skip_bank_init(i)) +			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +	} +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. 
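The per-CPU sysdev registered above exposes its tunables through sysfs. A small sketch of adjusting a few of them follows; the /sys/devices/system/machinecheck/machinecheck0 path is an assumption based on the "machinecheck" sysdev class name, and the helper path passed to trigger is hypothetical. Run as root.

/* Hypothetical example: tweak CPU 0's machinecheck tunables. */
#include <stdio.h>

#define MC0 "/sys/devices/system/machinecheck/machinecheck0/"

static int write_attr(const char *name, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), MC0 "%s", name);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", val);
        return fclose(f);       /* attribute store() runs when data is flushed */
}

int main(void)
{
        write_attr("tolerant", "1");            /* see the tolerant levels above */
        write_attr("check_interval", "300");    /* seconds; triggers mce_restart() */
        write_attr("trigger", "/usr/sbin/mce-helper");  /* hypothetical helper */
        return 0;
}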
*/ +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; +	struct timer_list *t = &per_cpu(mce_timer, cpu); + +	switch (action) { +	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN: +		mce_create_device(cpu); +		if (threshold_cpu_callback) +			threshold_cpu_callback(action, cpu); +		break; +	case CPU_DEAD: +	case CPU_DEAD_FROZEN: +		if (threshold_cpu_callback) +			threshold_cpu_callback(action, cpu); +		mce_remove_device(cpu); +		break; +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		del_timer_sync(t); +		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); +		break; +	case CPU_DOWN_FAILED: +	case CPU_DOWN_FAILED_FROZEN: +		t->expires = round_jiffies(jiffies + +						__get_cpu_var(next_interval)); +		add_timer_on(t, cpu); +		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); +		break; +	case CPU_POST_DEAD: +		/* intentionally ignoring frozen here */ +		cmci_rediscover(cpu); +		break; +	} +	return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier __cpuinitdata = { +	.notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_banks(void) +{ +	int i; + +	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, +				GFP_KERNEL); +	if (!bank_attrs) +		return -ENOMEM; + +	for (i = 0; i < banks; i++) { +		struct sysdev_attribute *a = &bank_attrs[i]; + +		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i); +		if (!a->attr.name) +			goto nomem; + +		a->attr.mode	= 0644; +		a->show		= show_bank; +		a->store	= set_bank; +	} +	return 0; + +nomem: +	while (--i >= 0) +		kfree(bank_attrs[i].attr.name); +	kfree(bank_attrs); +	bank_attrs = NULL; + +	return -ENOMEM; +} + +static __init int mce_init_device(void) +{ +	int err; +	int i = 0; + +	if (!mce_available(&boot_cpu_data)) +		return -EIO; + +	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + +	err = mce_init_banks(); +	if (err) +		return err; + +	err = sysdev_class_register(&mce_sysclass); +	if (err) +		return err; + +	for_each_online_cpu(i) { +		err = mce_create_device(i); +		if (err) +			return err; +	} + +	register_hotcpu_notifier(&mce_cpu_notifier); +	misc_register(&mce_log_device); + +	return err; +} + +device_initcall(mce_init_device); + +#else /* CONFIG_X86_OLD_MCE: */ + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */ + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ +	if (mce_disabled) +		return; + +	switch (c->x86_vendor) { +	case X86_VENDOR_AMD: +		amd_mcheck_init(c); +		break; + +	case X86_VENDOR_INTEL: +		if (c->x86 == 5) +			intel_p5_mcheck_init(c); +		if (c->x86 == 6) +			intel_p6_mcheck_init(c); +		if (c->x86 == 15) +			intel_p4_mcheck_init(c); +		break; + +	case X86_VENDOR_CENTAUR: +		if (c->x86 == 5) +			winchip_mcheck_init(c); +		break; + +	default: +		break; +	} +	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +} + +static int __init mcheck_enable(char *str) +{ +	mce_p5_enabled = 1; +	return 1; +} +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. 
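
mce_cpu_callback() is a textbook CPU-hotplug notifier: one notifier_block registered from mce_init_device(), reacting to CPU_ONLINE, CPU_DOWN_PREPARE, CPU_DEAD and their _FROZEN twins. Below is a minimal sketch of the same pattern as a standalone module, written against the hotplug API of this kernel generation; the demo_* names and printk messages are placeholders and not part of the patch.

/* Minimal hotplug-notifier sketch mirroring mce_cpu_callback() above.
 * Builds only in a kernel tree of this vintage (register_hotcpu_notifier
 * era); everything here is illustrative. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/cpu.h>

static int __cpuinit demo_cpu_callback(struct notifier_block *nfb,
				       unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* where mce_create_device(cpu) runs above */
		printk(KERN_INFO "demo: cpu %u online\n", cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/* where the timer is stopped and mce_disable_cpu runs */
		printk(KERN_INFO "demo: cpu %u going down\n", cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* where mce_remove_device(cpu) runs above */
		printk(KERN_INFO "demo: cpu %u dead\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_notifier __cpuinitdata = {
	.notifier_call = demo_cpu_callback,
};

static int __init demo_init(void)
{
	register_hotcpu_notifier(&demo_cpu_notifier);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_hotcpu_notifier(&demo_cpu_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
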
+ */ +static int __init mcheck_disable(char *str) +{ +	mce_disabled = 1; +	return 1; +} +__setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index ae9f628838f..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,14 +0,0 @@ -#include <linux/init.h> -#include <asm/mce.h> - -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); - -/* Call the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -extern int nr_mce_banks; - diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 3552119b091..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com> - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/thread_info.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> - -#include "mce.h" - -int mce_disabled; -int nr_mce_banks; - -EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ -	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ -	if (mce_disabled == 1) -		return; - -	switch (c->x86_vendor) { -	case X86_VENDOR_AMD: -		amd_mcheck_init(c); -		break; - -	case X86_VENDOR_INTEL: -		if (c->x86 == 5) -			intel_p5_mcheck_init(c); -		if (c->x86 == 6) -			intel_p6_mcheck_init(c); -		if (c->x86 == 15) -			intel_p4_mcheck_init(c); -		break; - -	case X86_VENDOR_CENTAUR: -		if (c->x86 == 5) -			winchip_mcheck_init(c); -		break; - -	default: -		break; -	} -} - -static int __init mcheck_disable(char *str) -{ -	mce_disabled = 1; -	return 1; -} - -static int __init mcheck_enable(char *str) -{ -	mce_disabled = -1; -	return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c deleted file mode 100644 index 09dd1d414fc..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ /dev/null @@ -1,1187 +0,0 @@ -/* - * Machine check handler. - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. 
- * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/string.h> -#include <linux/rcupdate.h> -#include <linux/kallsyms.h> -#include <linux/sysdev.h> -#include <linux/miscdevice.h> -#include <linux/fs.h> -#include <linux/capability.h> -#include <linux/cpu.h> -#include <linux/percpu.h> -#include <linux/poll.h> -#include <linux/thread_info.h> -#include <linux/ctype.h> -#include <linux/kmod.h> -#include <linux/kdebug.h> -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/ratelimit.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/uaccess.h> -#include <asm/smp.h> -#include <asm/idle.h> - -#define MISC_MCELOG_MINOR 227 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - *   0: always panic on uncorrected errors, log corrected errors - *   1: panic or SIGBUS on uncorrected errors, log corrected errors - *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors - *   3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* MCA banks polled by the period polling timer for corrected events */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { -	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ -	memset(m, 0, sizeof(struct mce)); -	m->cpu = smp_processor_id(); -	rdtscll(m->tsc); -} - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { -	MCE_LOG_SIGNATURE, -	MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ -	unsigned next, entry; -	atomic_inc(&mce_events); -	mce->finished = 0; -	wmb(); -	for (;;) { -		entry = rcu_dereference(mcelog.next); -		for (;;) { -			/* When the buffer fills up discard new entries. Assume -			   that the earlier errors are the more interesting. */ -			if (entry >= MCE_LOG_LEN) { -				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); -				return; -			} -			/* Old left over entry. Skip. */ -			if (mcelog.entry[entry].finished) { -				entry++; -				continue; -			} -			break; -		} -		smp_rmb(); -		next = entry + 1; -		if (cmpxchg(&mcelog.next, entry, next) == entry) -			break; -	} -	memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); -	wmb(); -	mcelog.entry[entry].finished = 1; -	wmb(); - -	set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ -	printk(KERN_EMERG "\n" -	       KERN_EMERG "HARDWARE ERROR\n" -	       KERN_EMERG -	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", -	       m->cpu, m->mcgstatus, m->bank, m->status); -	if (m->ip) { -		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", -		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", -		       m->cs, m->ip); -		if (m->cs == __KERNEL_CS) -			print_symbol("{%s}", m->ip); -		printk("\n"); -	} -	printk(KERN_EMERG "TSC %llx ", m->tsc); -	if (m->addr) -		printk("ADDR %llx ", m->addr); -	if (m->misc) -		printk("MISC %llx ", m->misc); -	printk("\n"); -	printk(KERN_EMERG "This is not a software problem!\n"); -	printk(KERN_EMERG "Run through mcelog --ascii to decode " -	       "and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ -	int i; - -	oops_begin(); -	for (i = 0; i < MCE_LOG_LEN; i++) { -		unsigned long tsc = mcelog.entry[i].tsc; - -		if (time_before(tsc, start)) -			continue; -		print_mce(&mcelog.entry[i]); -		if (backup && mcelog.entry[i].tsc == backup->tsc) -			backup = NULL; -	} -	if (backup) -		print_mce(backup); -	panic(msg); -} - -int mce_available(struct cpuinfo_x86 *c) -{ -	if (mce_dont_init) -		return 0; -	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ -	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { -		m->ip = regs->ip; -		m->cs = regs->cs; -	} else { -		m->ip = 0; -		m->cs = 0; -	} -	if (rip_msr) { -		/* Assume the RIP in the MSR is exact. Is this true? */ -		m->mcgstatus |= MCG_STATUS_EIPV; -		rdmsrl(rip_msr, m->ip); -		m->cs = 0; -	} -} - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ -	struct mce m; -	int i; - -	mce_setup(&m); - -	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); -	for (i = 0; i < banks; i++) { -		if (!bank[i] || !test_bit(i, *b)) -			continue; - -		m.misc = 0; -		m.addr = 0; -		m.bank = i; -		m.tsc = 0; - -		barrier(); -		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); -		if (!(m.status & MCI_STATUS_VAL)) -			continue; - -		/* -		 * Uncorrected events are handled by the exception handler -		 * when it is enabled. But when the exception is disabled log -		 * everything. -		 * -		 * TBD do the same check for MCI_STATUS_EN here? -		 */ -		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) -			continue; - -		if (m.status & MCI_STATUS_MISCV) -			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); -		if (m.status & MCI_STATUS_ADDRV) -			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - -		if (!(flags & MCP_TIMESTAMP)) -			m.tsc = 0; -		/* -		 * Don't get the IP here because it's unlikely to -		 * have anything to do with the actual error location. -		 */ -		if (!(flags & MCP_DONTLOG)) { -			mce_log(&m); -			add_taint(TAINT_MACHINE_CHECK); -		} - -		/* -		 * Clear state for this bank. -		 */ -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} - -	/* -	 * Don't clear MCG_STATUS here because it's only defined for -	 * exceptions. -	 */ -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - */ -void do_machine_check(struct pt_regs * regs, long error_code) -{ -	struct mce m, panicm; -	u64 mcestart = 0; -	int i; -	int panicm_found = 0; -	/* -	 * If no_way_out gets set, there is no safe way to recover from this -	 * MCE.  If tolerant is cranked up, we'll try anyway. 
-	 */ -	int no_way_out = 0; -	/* -	 * If kill_it gets set, there might be a way to recover from this -	 * error. -	 */ -	int kill_it = 0; -	DECLARE_BITMAP(toclear, MAX_NR_BANKS); - -	atomic_inc(&mce_entry); - -	if (notify_die(DIE_NMI, "machine check", regs, error_code, -			   18, SIGKILL) == NOTIFY_STOP) -		goto out2; -	if (!banks) -		goto out2; - -	mce_setup(&m); - -	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); -	/* if the restart IP is not valid, we're done for */ -	if (!(m.mcgstatus & MCG_STATUS_RIPV)) -		no_way_out = 1; - -	rdtscll(mcestart); -	barrier(); - -	for (i = 0; i < banks; i++) { -		__clear_bit(i, toclear); -		if (!bank[i]) -			continue; - -		m.misc = 0; -		m.addr = 0; -		m.bank = i; - -		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); -		if ((m.status & MCI_STATUS_VAL) == 0) -			continue; - -		/* -		 * Non uncorrected errors are handled by machine_check_poll -		 * Leave them alone. -		 */ -		if ((m.status & MCI_STATUS_UC) == 0) -			continue; - -		/* -		 * Set taint even when machine check was not enabled. -		 */ -		add_taint(TAINT_MACHINE_CHECK); - -		__set_bit(i, toclear); - -		if (m.status & MCI_STATUS_EN) { -			/* if PCC was set, there's no way out */ -			no_way_out |= !!(m.status & MCI_STATUS_PCC); -			/* -			 * If this error was uncorrectable and there was -			 * an overflow, we're in trouble.  If no overflow, -			 * we might get away with just killing a task. -			 */ -			if (m.status & MCI_STATUS_UC) { -				if (tolerant < 1 || m.status & MCI_STATUS_OVER) -					no_way_out = 1; -				kill_it = 1; -			} -		} else { -			/* -			 * Machine check event was not enabled. Clear, but -			 * ignore. -			 */ -			continue; -		} - -		if (m.status & MCI_STATUS_MISCV) -			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); -		if (m.status & MCI_STATUS_ADDRV) -			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - -		mce_get_rip(&m, regs); -		mce_log(&m); - -		/* Did this bank cause the exception? */ -		/* Assume that the bank with uncorrectable errors did it, -		   and that there is only a single one. */ -		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { -			panicm = m; -			panicm_found = 1; -		} -	} - -	/* If we didn't find an uncorrectable error, pick -	   the last one (shouldn't happen, just being safe). */ -	if (!panicm_found) -		panicm = m; - -	/* -	 * If we have decided that we just CAN'T continue, and the user -	 *  has not set tolerant to an insane level, give up and die. -	 */ -	if (no_way_out && tolerant < 3) -		mce_panic("Machine check", &panicm, mcestart); - -	/* -	 * If the error seems to be unrecoverable, something should be -	 * done.  Try to kill as little as possible.  If we can kill just -	 * one task, do that.  If the user has set the tolerance very -	 * high, don't try to do anything at all. -	 */ -	if (kill_it && tolerant < 3) { -		int user_space = 0; - -		/* -		 * If the EIPV bit is set, it means the saved IP is the -		 * instruction which caused the MCE. -		 */ -		if (m.mcgstatus & MCG_STATUS_EIPV) -			user_space = panicm.ip && (panicm.cs & 3); - -		/* -		 * If we know that the error was in user space, send a -		 * SIGBUS.  Otherwise, panic if tolerance is low. -		 * -		 * force_sig() takes an awful lot of locks and has a slight -		 * risk of deadlocking. 
-		 */ -		if (user_space) { -			force_sig(SIGBUS, current); -		} else if (panic_on_oops || tolerant < 2) { -			mce_panic("Uncorrected machine check", -				&panicm, mcestart); -		} -	} - -	/* notify userspace ASAP */ -	set_thread_flag(TIF_MCE_NOTIFY); - -	/* the last thing we do is clear state */ -	for (i = 0; i < banks; i++) { -		if (test_bit(i, toclear)) -			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} -	wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: -	atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occurred. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(__u64 status) -{ -	struct mce m; - -	mce_setup(&m); -	m.bank = MCE_THERMAL_BANK; -	m.status = status; -	mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors.  If the - * poller finds an MCE, poll 2x faster.  When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static void mcheck_timer(unsigned long data) -{ -	struct timer_list *t = &per_cpu(mce_timer, data); -	int *n; - -	WARN_ON(smp_processor_id() != data); - -	if (mce_available(¤t_cpu_data)) -		machine_check_poll(MCP_TIMESTAMP, -				&__get_cpu_var(mce_poll_banks)); - -	/* -	 * Alert userspace if needed.  If we logged an MCE, reduce the -	 * polling interval, otherwise increase the polling interval. -	 */ -	n = &__get_cpu_var(next_interval); -	if (mce_notify_user()) { -		*n = max(*n/2, HZ/100); -	} else { -		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); -	} - -	t->expires = jiffies + *n; -	add_timer(t); -} - -static void mce_do_trigger(struct work_struct *work) -{ -	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_user(void) -{ -	/* Not more than two messages every minute */ -	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - -	clear_thread_flag(TIF_MCE_NOTIFY); -	if (test_and_clear_bit(0, ¬ify_user)) { -		wake_up_interruptible(&mce_wait); - -		/* -		 * There is no risk of missing notifications because -		 * work_pending is always cleared before the function is -		 * executed. 
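
The polling logic above is adaptive: each time an event is logged the per-CPU interval is halved (bounded below by HZ/100), and each quiet pass doubles it again, capped at check_interval seconds. A small userspace model of that back-off follows, assuming HZ=1000 purely for illustration and ignoring the round_jiffies_relative() rounding the kernel applies.

/* Userspace model of the mcheck_timer() interval adaptation above.
 * HZ and the event pattern are illustrative assumptions. */
#include <stdio.h>

#define HZ		1000
#define CHECK_INTERVAL	(5 * 60)		/* 5 minutes, as in the patch */

static int next_interval = CHECK_INTERVAL * HZ;	/* in "jiffies" */

static void adapt(int logged_event)
{
	if (logged_event)
		next_interval = next_interval / 2 > HZ / 100 ?
				next_interval / 2 : HZ / 100;
	else
		next_interval = next_interval * 2 < CHECK_INTERVAL * HZ ?
				next_interval * 2 : CHECK_INTERVAL * HZ;
}

int main(void)
{
	int i;

	/* A burst of corrected errors drives the poll rate up ... */
	for (i = 0; i < 5; i++) {
		adapt(1);
		printf("event logged -> poll every %d jiffies\n", next_interval);
	}
	/* ... and silence lets it decay back toward check_interval. */
	for (i = 0; i < 5; i++) {
		adapt(0);
		printf("quiet        -> poll every %d jiffies\n", next_interval);
	}
	return 0;
}
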
-		 */ -		if (trigger[0] && !work_pending(&mce_trigger_work)) -			schedule_work(&mce_trigger_work); - -		if (__ratelimit(&ratelimit)) -			printk(KERN_INFO "Machine check events logged\n"); - -		return 1; -	} -	return 0; -} - -/* see if the idle task needs to notify userspace */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) -{ -	/* IDLE_END should be safe - interrupts are back on */ -	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) -		mce_notify_user(); - -	return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { -	.notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ -       idle_notifier_register(&mce_idle_notifier); -       return 0; -} -__initcall(periodic_mcheck_init); - -/* - * Initialize Machine Checks for a CPU. - */ -static int mce_cap_init(void) -{ -	u64 cap; -	unsigned b; - -	rdmsrl(MSR_IA32_MCG_CAP, cap); -	b = cap & 0xff; -	if (b > MAX_NR_BANKS) { -		printk(KERN_WARNING -		       "MCE: Using only %u machine check banks out of %u\n", -			MAX_NR_BANKS, b); -		b = MAX_NR_BANKS; -	} - -	/* Don't support asymmetric configurations today */ -	WARN_ON(banks != 0 && b != banks); -	banks = b; -	if (!bank) { -		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); -		if (!bank) -			return -ENOMEM; -		memset(bank, 0xff, banks * sizeof(u64)); -	} - -	/* Use accurate RIP reporting if available. */ -	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) -		rip_msr = MSR_IA32_MCG_EIP; - -	return 0; -} - -static void mce_init(void *dummy) -{ -	u64 cap; -	int i; -	mce_banks_t all_banks; - -	/* -	 * Log the machine checks left over from the previous reset. -	 */ -	bitmap_fill(all_banks, MAX_NR_BANKS); -	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); - -	set_in_cr4(X86_CR4_MCE); - -	rdmsrl(MSR_IA32_MCG_CAP, cap); -	if (cap & MCG_CTL_P) -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - -	for (i = 0; i < banks; i++) { -		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); -	} -} - -/* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) -{ -	/* This should be disabled by the BIOS, but isn't always */ -	if (c->x86_vendor == X86_VENDOR_AMD) { -		if (c->x86 == 15 && banks > 4) -			/* disable GART TBL walk error reporting, which trips off -			   incorrectly with the IOMMU & 3ware & Cerberus. */ -			clear_bit(10, (unsigned long *)&bank[4]); -		if(c->x86 <= 17 && mce_bootlog < 0) -			/* Lots of broken BIOS around that don't clear them -			   by default and leave crap in there. Don't log. */ -			mce_bootlog = 0; -	} - -} - -static void mce_cpu_features(struct cpuinfo_x86 *c) -{ -	switch (c->x86_vendor) { -	case X86_VENDOR_INTEL: -		mce_intel_feature_init(c); -		break; -	case X86_VENDOR_AMD: -		mce_amd_feature_init(c); -		break; -	default: -		break; -	} -} - -static void mce_init_timer(void) -{ -	struct timer_list *t = &__get_cpu_var(mce_timer); -	int *n = &__get_cpu_var(next_interval); - -	*n = check_interval * HZ; -	if (!*n) -		return; -	setup_timer(t, mcheck_timer, smp_processor_id()); -	t->expires = round_jiffies(jiffies + *n); -	add_timer(t); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. 
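
mce_cap_init() derives everything it needs from MSR_IA32_MCG_CAP (0x179): bits 7:0 give the bank count, bit 8 (MCG_CTL_P) advertises MCG_CTL, and bit 9 together with an extended-register count of at least 9 in bits 23:16 enables use of the separate RIP MSR. With the msr driver loaded, the same decode can be reproduced from userspace; this is a hedged sketch and not part of the patch.

/* Hedged userspace sketch: decode MSR_IA32_MCG_CAP the same way
 * mce_cap_init() does.  Requires root and the msr.ko driver
 * (/dev/cpu/N/msr). */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_IA32_MCG_CAP	0x179

int main(void)
{
	uint64_t cap;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 ||
	    pread(fd, &cap, sizeof(cap), MSR_IA32_MCG_CAP) != sizeof(cap)) {
		perror("MSR_IA32_MCG_CAP");
		return 1;
	}
	printf("banks          : %u\n", (unsigned)(cap & 0xff));
	printf("MCG_CTL present: %s\n", (cap & (1 << 8)) ? "yes" : "no");
	printf("accurate RIP   : %s\n",
	       ((cap & (1 << 9)) && ((cap >> 16) & 0xff) >= 9) ? "yes" : "no");
	close(fd);
	return 0;
}
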
- */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ -	if (!mce_available(c)) -		return; - -	if (mce_cap_init() < 0) { -		mce_dont_init = 1; -		return; -	} -	mce_cpu_quirks(c); - -	mce_init(NULL); -	mce_cpu_features(c); -	mce_init_timer(); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count;	/* #times opened */ -static int open_exclu;	/* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ -	lock_kernel(); -	spin_lock(&mce_state_lock); - -	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { -		spin_unlock(&mce_state_lock); -		unlock_kernel(); -		return -EBUSY; -	} - -	if (file->f_flags & O_EXCL) -		open_exclu = 1; -	open_count++; - -	spin_unlock(&mce_state_lock); -	unlock_kernel(); - -	return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ -	spin_lock(&mce_state_lock); - -	open_count--; -	open_exclu = 0; - -	spin_unlock(&mce_state_lock); - -	return 0; -} - -static void collect_tscs(void *data) -{ -	unsigned long *cpu_tsc = (unsigned long *)data; - -	rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, -			loff_t *off) -{ -	unsigned long *cpu_tsc; -	static DEFINE_MUTEX(mce_read_mutex); -	unsigned prev, next; -	char __user *buf = ubuf; -	int i, err; - -	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); -	if (!cpu_tsc) -		return -ENOMEM; - -	mutex_lock(&mce_read_mutex); -	next = rcu_dereference(mcelog.next); - -	/* Only supports full reads right now */ -	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { -		mutex_unlock(&mce_read_mutex); -		kfree(cpu_tsc); -		return -EINVAL; -	} - -	err = 0; -	prev = 0; -	do { -		for (i = prev; i < next; i++) { -			unsigned long start = jiffies; - -			while (!mcelog.entry[i].finished) { -				if (time_after_eq(jiffies, start + 2)) { -					memset(mcelog.entry + i, 0, -					       sizeof(struct mce)); -					goto timeout; -				} -				cpu_relax(); -			} -			smp_rmb(); -			err |= copy_to_user(buf, mcelog.entry + i, -					    sizeof(struct mce)); -			buf += sizeof(struct mce); -timeout: -			; -		} - -		memset(mcelog.entry + prev, 0, -		       (next - prev) * sizeof(struct mce)); -		prev = next; -		next = cmpxchg(&mcelog.next, prev, 0); -	} while (next != prev); - -	synchronize_sched(); - -	/* -	 * Collect entries that were still getting written before the -	 * synchronize. -	 */ -	on_each_cpu(collect_tscs, cpu_tsc, 1); -	for (i = next; i < MCE_LOG_LEN; i++) { -		if (mcelog.entry[i].finished && -		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { -			err |= copy_to_user(buf, mcelog.entry+i, -					    sizeof(struct mce)); -			smp_rmb(); -			buf += sizeof(struct mce); -			memset(&mcelog.entry[i], 0, sizeof(struct mce)); -		} -	} -	mutex_unlock(&mce_read_mutex); -	kfree(cpu_tsc); -	return err ? 
-EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ -	poll_wait(file, &mce_wait, wait); -	if (rcu_dereference(mcelog.next)) -		return POLLIN | POLLRDNORM; -	return 0; -} - -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ -	int __user *p = (int __user *)arg; - -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -	switch (cmd) { -	case MCE_GET_RECORD_LEN: -		return put_user(sizeof(struct mce), p); -	case MCE_GET_LOG_LEN: -		return put_user(MCE_LOG_LEN, p); -	case MCE_GETCLEAR_FLAGS: { -		unsigned flags; - -		do { -			flags = mcelog.flags; -		} while (cmpxchg(&mcelog.flags, flags, 0) != flags); -		return put_user(flags, p); -	} -	default: -		return -ENOTTY; -	} -} - -static const struct file_operations mce_chrdev_ops = { -	.open = mce_open, -	.release = mce_release, -	.read = mce_read, -	.poll = mce_poll, -	.unlocked_ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { -	MISC_MCELOG_MINOR, -	"mcelog", -	&mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ -	mce_dont_init = 1; -	return 1; -} - -/* mce=off disables machine check. -   mce=TOLERANCELEVEL (number, see above) -   mce=bootlog Log MCEs from before booting. Disabled by default on AMD. -   mce=nobootlog Don't log MCEs from before booting. */ -static int __init mcheck_enable(char *str) -{ -	if (!strcmp(str, "off")) -		mce_dont_init = 1; -	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) -		mce_bootlog = str[0] == 'b'; -	else if (isdigit(str[0])) -		get_option(&str, &tolerant); -	else -		printk("mce= argument %s ignored. Please use /sys", str); -	return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce=", mcheck_enable); - -/* - * Sysfs support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static int mce_disable(void) -{ -	int i; - -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -	return 0; -} - -static int mce_suspend(struct sys_device *dev, pm_message_t state) -{ -	return mce_disable(); -} - -static int mce_shutdown(struct sys_device *dev) -{ -	return mce_disable(); -} - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. -   Only one CPU is active at this time, the others get readded later using -   CPU hotplug. */ -static int mce_resume(struct sys_device *dev) -{ -	mce_init(NULL); -	mce_cpu_features(¤t_cpu_data); -	return 0; -} - -static void mce_cpu_restart(void *data) -{ -	del_timer_sync(&__get_cpu_var(mce_timer)); -	if (mce_available(¤t_cpu_data)) -		mce_init(NULL); -	mce_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ -	on_each_cpu(mce_cpu_restart, NULL, 1); -} - -static struct sysdev_class mce_sysclass = { -	.suspend = mce_suspend, -	.shutdown = mce_shutdown, -	.resume = mce_resume, -	.name = "machinecheck", -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; - -/* Why are there no generic functions for this? 
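
The misc device registered above ("mcelog", minor 227) is the interface the mcelog(8) tool drains: query the record and log sizes with the MCE_GET_RECORD_LEN / MCE_GET_LOG_LEN ioctls from asm/mce.h, then read() the whole buffer in one pass, since mce_read() rejects partial reads. A hedged consumer sketch that treats the records as opaque bytes:

/* Hedged sketch of a /dev/mcelog consumer, mirroring mce_read()/mce_ioctl()
 * above.  The ioctl definitions repeat what asm/mce.h provides; the records
 * themselves are not decoded here. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define MCE_GET_RECORD_LEN	_IOR('M', 1, int)
#define MCE_GET_LOG_LEN		_IOR('M', 2, int)

int main(void)
{
	int fd, rec_len = 0, log_len = 0, n;
	char *buf;

	fd = open("/dev/mcelog", O_RDONLY);
	if (fd < 0) {
		perror("/dev/mcelog");
		return 1;
	}
	if (ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &log_len) < 0) {
		perror("ioctl");
		return 1;
	}
	buf = malloc((size_t)rec_len * log_len);
	if (!buf)
		return 1;

	/* mce_read() above only accepts reads covering the full log. */
	n = read(fd, buf, (size_t)rec_len * log_len);
	printf("%d byte(s) read, record size %d, log length %d\n",
	       n < 0 ? 0 : n, rec_len, log_len);

	free(buf);
	close(fd);
	return 0;
}

Run as root; an empty log simply yields zero bytes.
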
*/ -#define ACCESSOR(name, var, start) \ -	static ssize_t show_ ## name(struct sys_device *s,		\ -				     struct sysdev_attribute *attr,	\ -				     char *buf) {			\ -		return sprintf(buf, "%lx\n", (unsigned long)var);	\ -	}								\ -	static ssize_t set_ ## name(struct sys_device *s,		\ -				    struct sysdev_attribute *attr,	\ -				    const char *buf, size_t siz) {	\ -		char *end;						\ -		unsigned long new = simple_strtoul(buf, &end, 0);	\ -		if (end == buf) return -EINVAL;				\ -		var = new;						\ -		start;							\ -		return end-buf;						\ -	}								\ -	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -static struct sysdev_attribute *bank_attrs; - -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, -			 char *buf) -{ -	u64 b = bank[attr - bank_attrs]; -	return sprintf(buf, "%llx\n", b); -} - -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, -			const char *buf, size_t siz) -{ -	char *end; -	u64 new = simple_strtoull(buf, &end, 0); -	if (end == buf) -		return -EINVAL; -	bank[attr - bank_attrs] = new; -	mce_restart(); -	return end-buf; -} - -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, -				char *buf) -{ -	strcpy(buf, trigger); -	strcat(buf, "\n"); -	return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, -				const char *buf,size_t siz) -{ -	char *p; -	int len; -	strncpy(trigger, buf, sizeof(trigger)); -	trigger[sizeof(trigger)-1] = 0; -	len = strlen(trigger); -	p = strchr(trigger, '\n'); -	if (*p) *p = 0; -	return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { -	&attr_tolerant.attr, &attr_check_interval, &attr_trigger, -	NULL -}; - -static cpumask_var_t mce_device_initialized; - -/* Per cpu sysdev init.  
All of the cpus still share the same ctl bank */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ -	int err; -	int i; - -	if (!mce_available(&boot_cpu_data)) -		return -EIO; - -	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); -	per_cpu(device_mce,cpu).id = cpu; -	per_cpu(device_mce,cpu).cls = &mce_sysclass; - -	err = sysdev_register(&per_cpu(device_mce,cpu)); -	if (err) -		return err; - -	for (i = 0; mce_attributes[i]; i++) { -		err = sysdev_create_file(&per_cpu(device_mce,cpu), -					 mce_attributes[i]); -		if (err) -			goto error; -	} -	for (i = 0; i < banks; i++) { -		err = sysdev_create_file(&per_cpu(device_mce, cpu), -					&bank_attrs[i]); -		if (err) -			goto error2; -	} -	cpumask_set_cpu(cpu, mce_device_initialized); - -	return 0; -error2: -	while (--i >= 0) { -		sysdev_remove_file(&per_cpu(device_mce, cpu), -					&bank_attrs[i]); -	} -error: -	while (--i >= 0) { -		sysdev_remove_file(&per_cpu(device_mce,cpu), -				   mce_attributes[i]); -	} -	sysdev_unregister(&per_cpu(device_mce,cpu)); - -	return err; -} - -static __cpuinit void mce_remove_device(unsigned int cpu) -{ -	int i; - -	if (!cpumask_test_cpu(cpu, mce_device_initialized)) -		return; - -	for (i = 0; mce_attributes[i]; i++) -		sysdev_remove_file(&per_cpu(device_mce,cpu), -			mce_attributes[i]); -	for (i = 0; i < banks; i++) -		sysdev_remove_file(&per_cpu(device_mce, cpu), -			&bank_attrs[i]); -	sysdev_unregister(&per_cpu(device_mce,cpu)); -	cpumask_clear_cpu(cpu, mce_device_initialized); -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) -{ -	int i; -	unsigned long action = *(unsigned long *)h; - -	if (!mce_available(¤t_cpu_data)) -		return; -	if (!(action & CPU_TASKS_FROZEN)) -		cmci_clear(); -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); -} - -static void mce_reenable_cpu(void *h) -{ -	int i; -	unsigned long action = *(unsigned long *)h; - -	if (!mce_available(¤t_cpu_data)) -		return; -	if (!(action & CPU_TASKS_FROZEN)) -		cmci_reenable(); -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, -				      unsigned long action, void *hcpu) -{ -	unsigned int cpu = (unsigned long)hcpu; -	struct timer_list *t = &per_cpu(mce_timer, cpu); - -	switch (action) { -	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -		mce_create_device(cpu); -		if (threshold_cpu_callback) -			threshold_cpu_callback(action, cpu); -		break; -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		if (threshold_cpu_callback) -			threshold_cpu_callback(action, cpu); -		mce_remove_device(cpu); -		break; -	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN: -		del_timer_sync(t); -		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); -		break; -	case CPU_DOWN_FAILED: -	case CPU_DOWN_FAILED_FROZEN: -		t->expires = round_jiffies(jiffies + -						__get_cpu_var(next_interval)); -		add_timer_on(t, cpu); -		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); -		break; -	case CPU_POST_DEAD: -		/* intentionally ignoring frozen here */ -		cmci_rediscover(cpu); -		break; -	} -	return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier __cpuinitdata = { -	.notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_banks(void) -{ -	int i; - -	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, -				GFP_KERNEL); -	if (!bank_attrs) -		return -ENOMEM; - -	for (i = 0; i < banks; i++) { -		struct sysdev_attribute *a = &bank_attrs[i]; -		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); -		if (!a->attr.name) -			goto nomem; -		a->attr.mode = 0644; -		a->show = show_bank; -		a->store = set_bank; -	} -	return 0; - -nomem: -	while (--i >= 0) -		kfree(bank_attrs[i].attr.name); -	kfree(bank_attrs); -	bank_attrs = NULL; -	return -ENOMEM; -} - -static __init int mce_init_device(void) -{ -	int err; -	int i = 0; - -	if (!mce_available(&boot_cpu_data)) -		return -EIO; - -	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); - -	err = mce_init_banks(); -	if (err) -		return err; - -	err = sysdev_class_register(&mce_sysclass); -	if (err) -		return err; - -	for_each_online_cpu(i) { -		err = mce_create_device(i); -		if (err) -			return err; -	} - -	register_hotcpu_notifier(&mce_cpu_notifier); -	misc_register(&mce_log_device); -	return err; -} - -device_initcall(mce_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 56dde9c4bc9..ddae21620bd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -13,22 +13,22 @@   *   *  All MC4_MISCi registers are shared between multi-cores   */ - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/init.h>  #include <linux/interrupt.h> -#include <linux/kobject.h>  #include <linux/notifier.h> -#include <linux/sched.h> -#include <linux/smp.h> +#include <linux/kobject.h> +#include <linux/percpu.h>  #include <linux/sysdev.h> +#include <linux/errno.h> +#include <linux/sched.h>  #include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/smp.h> +  #include <asm/apic.h> +#include <asm/idle.h>  #include <asm/mce.h>  #include <asm/msr.h> -#include <asm/percpu.h> -#include <asm/idle.h>  #define PFX               "mce_threshold: "  #define VERSION           "version 1.1.1" @@ -48,26 +48,26 @@  #define MCG_XBLK_ADDR     0xC0000400  struct threshold_block { -	unsigned int block; -	unsigned int bank; -	unsigned int cpu; -	u32 address; -	u16 interrupt_enable; -	u16 threshold_limit; -	struct kobject kobj; -	struct list_head miscj; +	unsigned int		block; +	unsigned int		bank; +	
unsigned int		cpu; +	u32			address; +	u16			interrupt_enable; +	u16			threshold_limit; +	struct kobject		kobj; +	struct list_head	miscj;  };  /* defaults used early on boot */  static struct threshold_block threshold_defaults = { -	.interrupt_enable = 0, -	.threshold_limit = THRESHOLD_MAX, +	.interrupt_enable	= 0, +	.threshold_limit	= THRESHOLD_MAX,  };  struct threshold_bank { -	struct kobject *kobj; -	struct threshold_block *blocks; -	cpumask_var_t cpus; +	struct kobject		*kobj; +	struct threshold_block	*blocks; +	cpumask_var_t		cpus;  };  static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);   */  struct thresh_restart { -	struct threshold_block *b; -	int reset; -	u16 old_limit; +	struct threshold_block	*b; +	int			reset; +	u16			old_limit;  };  /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)  	} else if (tr->old_limit) {	/* change limit w/o reset */  		int new_count = (mci_misc_hi & THRESHOLD_MAX) +  		    (tr->old_limit - tr->b->threshold_limit); +  		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |  		    (new_count & THRESHOLD_MAX);  	} @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)  /* cpu init entry point, called from mce.c with preempt off */  void mce_amd_feature_init(struct cpuinfo_x86 *c)  { -	unsigned int bank, block;  	unsigned int cpu = smp_processor_id(); -	u8 lvt_off;  	u32 low = 0, high = 0, address = 0; +	unsigned int bank, block;  	struct thresh_restart tr; +	u8 lvt_off;  	for (bank = 0; bank < NR_BANKS; ++bank) {  		for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  				if (!address)  					break;  				address += MCG_XBLK_ADDR; -			} -			else +			} else  				++address;  			if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)   */  static void amd_threshold_interrupt(void)  { +	u32 low = 0, high = 0, address = 0;  	unsigned int bank, block;  	struct mce m; -	u32 low = 0, high = 0, address = 0;  	mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)  		if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))  			continue;  		for (block = 0; block < NR_BLOCKS; ++block) { -			if (block == 0) +			if (block == 0) {  				address = MSR_IA32_MC0_MISC + bank * 4; -			else if (block == 1) { +			} else if (block == 1) {  				address = (low & MASK_BLKPTR_LO) >> 21;  				if (!address)  					break;  				address += MCG_XBLK_ADDR; -			} -			else +			} else {  				++address; +			}  			if (rdmsr_safe(address, &low, &high))  				break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)  			     (high & MASK_LOCKED_HI))  				continue; -			/* Log the machine check that caused the threshold -			   event. */ +			/* +			 * Log the machine check that caused the threshold +			 * event. 
+			 */  			machine_check_poll(MCP_TIMESTAMP,  					&__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)  struct threshold_attr {  	struct attribute attr; -	ssize_t(*show) (struct threshold_block *, char *); -	ssize_t(*store) (struct threshold_block *, const char *, size_t count); +	ssize_t (*show) (struct threshold_block *, char *); +	ssize_t (*store) (struct threshold_block *, const char *, size_t count);  }; -#define SHOW_FIELDS(name)                                           \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{                                                                   \ -        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \ +#define SHOW_FIELDS(name)						\ +static ssize_t show_ ## name(struct threshold_block *b, char *buf)	\ +{									\ +	return sprintf(buf, "%lx\n", (unsigned long) b->name);		\  }  SHOW_FIELDS(interrupt_enable)  SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, -				      const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)  { -	char *end;  	struct thresh_restart tr; -	unsigned long new = simple_strtoul(buf, &end, 0); -	if (end == buf) +	unsigned long new; + +	if (strict_strtoul(buf, 0, &new) < 0)  		return -EINVAL; +  	b->interrupt_enable = !!new; -	tr.b = b; -	tr.reset = 0; -	tr.old_limit = 0; +	tr.b		= b; +	tr.reset	= 0; +	tr.old_limit	= 0; +  	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -	return end - buf; +	return size;  } -static ssize_t store_threshold_limit(struct threshold_block *b, -				     const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)  { -	char *end;  	struct thresh_restart tr; -	unsigned long new = simple_strtoul(buf, &end, 0); -	if (end == buf) +	unsigned long new; + +	if (strict_strtoul(buf, 0, &new) < 0)  		return -EINVAL; +  	if (new > THRESHOLD_MAX)  		new = THRESHOLD_MAX;  	if (new < 1)  		new = 1; +  	tr.old_limit = b->threshold_limit;  	b->threshold_limit = new;  	tr.b = b; @@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,  	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -	return end - buf; +	return size;  }  struct threshold_block_cross_cpu { -	struct threshold_block *tb; -	long retval; +	struct threshold_block	*tb; +	long			retval;  };  static void local_error_count_handler(void *_tbcc) @@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,  	return 1;  } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \ -        .attr = {.name = __stringify(_name), .mode = _mode }, \ -        .show = _show,                                        \ -        .store = _store,                                      \ +#define RW_ATTR(val)							\ +static struct threshold_attr val = {					\ +	.attr	= {.name = __stringify(val), .mode = 0644 },		\ +	.show	= show_## val,						\ +	.store	= store_## val,						\  }; -#define RW_ATTR(name)                                           \ -static struct threshold_attr name =                             \ -        THRESHOLD_ATTR(name, 0644, show_## name, store_## name) -  RW_ATTR(interrupt_enable);  RW_ATTR(threshold_limit);  RW_ATTR(error_count); @@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {  	NULL  }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) 
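
A recurring change in these hunks is replacing simple_strtoul() plus an end-pointer check with strict_strtoul(), which only accepts a single, fully consumed number and lets the store return the full write size. A userspace analogue of that stricter contract, built on strtoul(); the trailing-newline tolerance mirrors how sysfs writes usually arrive and is an assumption about strict_strtoul()'s exact behaviour.

/* Userspace analogue of the strict_strtoul() contract used above:
 * accept the string only if it parses completely as one number. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int strict_parse_ulong(const char *s, int base, unsigned long *res)
{
	char *end;

	errno = 0;
	*res = strtoul(s, &end, base);
	if (errno || end == s)
		return -1;
	/* Tolerate a single trailing newline, as sysfs writes carry one. */
	if (*end == '\n')
		end++;
	if (*end != '\0')
		return -1;
	return 0;
}

int main(void)
{
	const char *inputs[] = { "15", "0x10", "15 extra", "banana" };
	unsigned long val;
	unsigned int i;

	for (i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
		printf("%-10s -> %s\n", inputs[i],
		       strict_parse_ulong(inputs[i], 0, &val) ? "rejected" : "ok");
	return 0;
}
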
container_of(a, struct threshold_attr, attr) +#define to_block(k)	container_of(k, struct threshold_block, kobj) +#define to_attr(a)	container_of(a, struct threshold_attr, attr)  static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)  {  	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret; +  	ret = a->show ? a->show(b, buf) : -EIO; +  	return ret;  } @@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,  	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret; +  	ret = a->store ? a->store(b, buf, count) : -EIO; +  	return ret;  }  static struct sysfs_ops threshold_ops = { -	.show = show, -	.store = store, +	.show			= show, +	.store			= store,  };  static struct kobj_type threshold_ktype = { -	.sysfs_ops = &threshold_ops, -	.default_attrs = default_attrs, +	.sysfs_ops		= &threshold_ops, +	.default_attrs		= default_attrs,  };  static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  					       unsigned int block,  					       u32 address)  { -	int err; -	u32 low, high;  	struct threshold_block *b = NULL; +	u32 low, high; +	int err;  	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))  		return 0; @@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  	if (!b)  		return -ENOMEM; -	b->block = block; -	b->bank = bank; -	b->cpu = cpu; -	b->address = address; -	b->interrupt_enable = 0; -	b->threshold_limit = THRESHOLD_MAX; +	b->block		= block; +	b->bank			= bank; +	b->cpu			= cpu; +	b->address		= address; +	b->interrupt_enable	= 0; +	b->threshold_limit	= THRESHOLD_MAX;  	INIT_LIST_HEAD(&b->miscj); -	if (per_cpu(threshold_banks, cpu)[bank]->blocks) +	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {  		list_add(&b->miscj,  			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); -	else +	} else {  		per_cpu(threshold_banks, cpu)[bank]->blocks = b; +	}  	err = kobject_init_and_add(&b->kobj, &threshold_ktype,  				   per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +455,9 @@ recurse:  		if (!address)  			return 0;  		address += MCG_XBLK_ADDR; -	} else +	} else {  		++address; +	}  	err = allocate_threshold_blocks(cpu, bank, ++block, address);  	if (err) @@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (!b)  			goto out; -		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, +		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,  					b->kobj, name);  		if (err)  			goto out;  		cpumask_copy(b->cpus, cpu_core_mask(cpu));  		per_cpu(threshold_banks, cpu)[bank] = b; +  		goto out;  	}  #endif @@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} -	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); +	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);  	if (!b->kobj)  		goto out_free; @@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (i == cpu)  			continue; -		err = sysfs_create_link(&per_cpu(device_mce, i).kobj, +		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,  					b->kobj, name);  		if (err)  			goto out; @@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,  static void threshold_remove_bank(unsigned int cpu, int bank)  { -	int i = 0;  	struct threshold_bank *b;  	char 
name[32]; +	int i = 0;  	b = per_cpu(threshold_banks, cpu)[bank]; -  	if (!b)  		return; -  	if (!b->blocks)  		goto free_out; @@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #ifdef CONFIG_SMP  	/* sibling symlink */  	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); +		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);  		per_cpu(threshold_banks, cpu)[bank] = NULL; +  		return;  	}  #endif @@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  		if (i == cpu)  			continue; -		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); +		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);  		per_cpu(threshold_banks, i)[bank] = NULL;  	} @@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)  }  /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, -						     unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)  { -	if (cpu >= NR_CPUS) -		return; -  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: @@ -686,11 +692,12 @@ static __init int threshold_init_device(void)  	/* to hit CPUs online before the notifier is up */  	for_each_online_cpu(lcpu) {  		int err = threshold_create_device(lcpu); +  		if (err)  			return err;  	}  	threshold_cpu_callback = amd_64_threshold_cpu_callback; +  	return 0;  } -  device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index cef3ee30744..e1acec0f7a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,85 +8,10 @@  #include <linux/init.h>  #include <linux/interrupt.h>  #include <linux/percpu.h> -#include <asm/processor.h>  #include <asm/apic.h> +#include <asm/processor.h>  #include <asm/msr.h>  #include <asm/mce.h> -#include <asm/hw_irq.h> -#include <asm/idle.h> -#include <asm/therm_throt.h> -#include <asm/apic.h> - -asmlinkage void smp_thermal_interrupt(void) -{ -	__u64 msr_val; - -	ack_APIC_irq(); - -	exit_idle(); -	irq_enter(); - -	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); -	if (therm_throt_process(msr_val & 1)) -		mce_log_therm_throt_event(msr_val); - -	inc_irq_stat(irq_thermal_count); -	irq_exit(); -} - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	int tm2 = 0; -	unsigned int cpu = smp_processor_id(); - -	if (!cpu_has(c, X86_FEATURE_ACPI)) -		return; - -	if (!cpu_has(c, X86_FEATURE_ACC)) -		return; - -	/* first check if TM1 is already enabled by the BIOS, in which -	 * case there might be some SMM goo which handles it, so we can't even -	 * put a handler since it might be delivered via SMI already. 
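
Each AMD threshold block created by allocate_threshold_blocks() is a kobject carrying the interrupt_enable, threshold_limit and error_count attributes defined above, linked under the per-CPU mce_dev object. A hedged sketch that dumps one block follows; the block directory is taken from the command line because its exact name and location are built outside the hunks shown here.

/* Hedged sketch: read the per-block AMD threshold attributes defined by
 * RW_ATTR() above.  Pass the block's sysfs directory as argv[1]; nothing
 * about its path is asserted here. */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *attrs[] = {
		"interrupt_enable", "threshold_limit", "error_count",
	};
	char path[512], buf[64];
	unsigned int i;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <threshold block sysfs dir>\n",
			argv[0]);
		return 1;
	}
	for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", argv[1], attrs[i]);
		f = fopen(path, "r");
		if (!f) {
			printf("%-17s <not readable>\n", attrs[i]);
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%-17s %s", attrs[i], buf);
		fclose(f);
	}
	return 0;
}
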
-	 */ -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	h = apic_read(APIC_LVTTHMR); -	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { -		printk(KERN_DEBUG -		       "CPU%d: Thermal monitoring handled by SMI\n", cpu); -		return; -	} - -	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) -		tm2 = 1; - -	if (h & APIC_VECTOR_MASK) { -		printk(KERN_DEBUG -		       "CPU%d: Thermal LVT vector (%#x) already " -		       "installed\n", cpu, (h & APIC_VECTOR_MASK)); -		return; -	} - -	h = THERMAL_APIC_VECTOR; -	h |= (APIC_DM_FIXED | APIC_LVT_MASKED); -	apic_write(APIC_LVTTHMR, h); - -	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); -	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - -	l = apic_read(APIC_LVTTHMR); -	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); -	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", -		cpu, tm2 ? "TM2" : "TM1"); - -	/* enable thermal throttle processing */ -	atomic_set(&therm_throt_en, 1); -	return; -}  /*   * Support for Intel Correct Machine Check Interrupts. This allows @@ -109,6 +34,9 @@ static int cmci_supported(int *banks)  {  	u64 cap; +	if (mce_cmci_disabled || mce_ignore_ce) +		return 0; +  	/*  	 * Vendor check is not strictly needed, but the initial  	 * initialization is vendor keyed and this @@ -132,7 +60,7 @@ static int cmci_supported(int *banks)  static void intel_threshold_interrupt(void)  {  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); -	mce_notify_user(); +	mce_notify_irq();  }  static void print_update(char *type, int *hdr, int num) @@ -248,7 +176,7 @@ void cmci_rediscover(int dying)  		return;  	cpumask_copy(old, ¤t->cpus_allowed); -	for_each_online_cpu (cpu) { +	for_each_online_cpu(cpu) {  		if (cpu == dying)  			continue;  		if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc..f5f2d6f71fb 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,25 +6,23 @@   * This file contains routines to check for non-fatal MCEs every 15s   *   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/workqueue.h>  #include <linux/interrupt.h> -#include <linux/smp.h> +#include <linux/workqueue.h> +#include <linux/jiffies.h> +#include <linux/kernel.h>  #include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" +static int		firstbank; -static int firstbank; - -#define MCE_RATE	15*HZ	/* timer rate is 15s */ +#define MCE_RATE	(15*HZ)	/* timer rate is 15s */  static void mce_checkregs(void *info)  { @@ -34,23 +32,24 @@ static void mce_checkregs(void *info)  	for (i = firstbank; i < nr_mce_banks; i++) {  		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high & (1<<31)) { -			printk(KERN_INFO "MCE: The hardware reports a non " -				"fatal, correctable incident occurred on " -				"CPU %d.\n", +		if (!(high & (1<<31))) +			continue; + +		printk(KERN_INFO "MCE: The hardware reports a non fatal, " +			"correctable incident occurred on CPU %d.\n",  				smp_processor_id()); -			printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); -			/* -			 * Scrub the error so we don't pick it up in MCE_RATE -			 * seconds time. 
-			 */ -			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); +		printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); -			/* Serialize */ -			wmb(); -			add_taint(TAINT_MACHINE_CHECK); -		} +		/* +		 * Scrub the error so we don't pick it up in MCE_RATE +		 * seconds time: +		 */ +		wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + +		/* Serialize: */ +		wmb(); +		add_taint(TAINT_MACHINE_CHECK);  	}  } @@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void)  	/* Some Athlons misbehave when we frob bank 0 */  	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -		boot_cpu_data.x86 == 6) -			firstbank = 1; +						boot_cpu_data.x86 == 6) +		firstbank = 1;  	else -			firstbank = 0; +		firstbank = 0;  	/*  	 * Check for non-fatal errors every MCE_RATE s  	 */  	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));  	printk(KERN_INFO "Machine check exception polling timer started.\n"); +  	return 0;  }  module_init(init_nonfatal_mce_checker); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf38..4482aea9aa2 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,21 +1,14 @@  /*   * P4 specific Machine Check Exception Reporting   */ - -#include <linux/init.h> -#include <linux/types.h>  #include <linux/kernel.h> -#include <linux/interrupt.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h> -#include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include <asm/apic.h> - -#include <asm/therm_throt.h> - -#include "mce.h"  /* as supported by the P4/Xeon family */  struct intel_mce_extended_msrs { @@ -34,98 +27,8 @@ struct intel_mce_extended_msrs {  static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ -	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", -			smp_processor_id()); -	add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler */ -static void intel_thermal_interrupt(struct pt_regs *regs) -{ -	__u64 msr_val; - -	ack_APIC_irq(); - -	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); -	therm_throt_process(msr_val & 0x1); -} - -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ -	irq_enter(); -	vendor_thermal_interrupt(regs); -	__get_cpu_var(irq_stat).irq_thermal_count++; -	irq_exit(); -} - -/* P4/Xeon Thermal regulation detect and init */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	unsigned int cpu = smp_processor_id(); - -	/* Thermal monitoring */ -	if (!cpu_has(c, X86_FEATURE_ACPI)) -		return;	/* -ENODEV */ - -	/* Clock modulation */ -	if (!cpu_has(c, X86_FEATURE_ACC)) -		return;	/* -ENODEV */ - -	/* first check if its enabled already, in which case there might -	 * be some SMM goo which handles it, so we can't even put a handler -	 * since it might be delivered via SMI already -zwanem. -	 */ -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	h = apic_read(APIC_LVTTHMR); -	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { -		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", -				cpu); -		return; /* -EBUSY */ -	} - -	/* check whether a vector already exists, temporarily masked? 
*/ -	if (h & APIC_VECTOR_MASK) { -		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " -				"installed\n", -			cpu, (h & APIC_VECTOR_MASK)); -		return; /* -EBUSY */ -	} - -	/* The temperature transition interrupt handler setup */ -	h = THERMAL_APIC_VECTOR;		/* our delivery vector */ -	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */ -	apic_write(APIC_LVTTHMR, h); - -	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); -	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - -	/* ok we're good to go... */ -	vendor_thermal_interrupt = intel_thermal_interrupt; - -	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - -	l = apic_read(APIC_LVTTHMR); -	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); -	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); - -	/* enable thermal throttle processing */ -	atomic_set(&therm_throt_en, 1); -	return; -} -#endif /* CONFIG_X86_MCE_P4THERMAL */ - -  /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)  {  	u32 h; @@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)  static void intel_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	if (mce_num_extended_msrs > 0) {  		struct intel_mce_extended_msrs dbg; +  		intel_get_extended_msrs(&dbg); +  		printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"  			"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"  			"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; +  			misc[0] = addr[0] = '\0';  			if (high & (1<<29))  				recover |= 1; @@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		panic("Unable to continue");  	printk(KERN_EMERG "Attempting to continue.\n"); +  	/*  	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not  	 * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -  void intel_p4_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69ed..5c0e6533d9b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,52 +2,67 @@   * P5 specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" +/* By default disabled */ +int mce_p5_enabled __read_mostly; -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */  static void pentium_machine_check(struct pt_regs *regs, long error_code)  {  	u32 loaddr, hi, lotype; +  	
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);  	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); -	printk(KERN_EMERG "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); -	if (lotype&(1<<5)) -		printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + +	printk(KERN_EMERG +		"CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n", +		smp_processor_id(), loaddr, lotype); + +	if (lotype & (1<<5)) { +		printk(KERN_EMERG +			"CPU#%d: Possible thermal failure (CPU on fire ?).\n", +			smp_processor_id()); +	} +  	add_taint(TAINT_MACHINE_CHECK);  } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */  void intel_p5_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; -	/*Check for MCE support */ -	if (!cpu_has(c, X86_FEATURE_MCE)) +	/* Default P5 to off as its often misconnected: */ +	if (!mce_p5_enabled)  		return; -	/* Default P5 to off as its often misconnected */ -	if (mce_disabled != -1) +	/* Check for MCE support: */ +	if (!cpu_has(c, X86_FEATURE_MCE))  		return; +  	machine_check_vector = pentium_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb(); -	/* Read registers before enabling */ +	/* Read registers before enabling: */  	rdmsr(MSR_IA32_P5_MC_ADDR, l, h);  	rdmsr(MSR_IA32_P5_MC_TYPE, l, h); -	printk(KERN_INFO "Intel old style machine check architecture supported.\n"); +	printk(KERN_INFO +	       "Intel old style machine check architecture supported.\n"); -	/* Enable MCE */ +	/* Enable MCE: */  	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); +	printk(KERN_INFO +	       "Intel old style machine check reporting enabled on CPU#%d.\n", +	       smp_processor_id());  } diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434..01e4f817818 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,25 +2,23 @@   * P6 specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" -  /* Machine Check Handler For PII/PIII */  static void intel_machine_check(struct pt_regs *regs, long error_code)  { -	int recover = 1;  	u32 alow, ahigh, high, low;  	u32 mcgstl, mcgsth; +	int recover = 1;  	int i;  	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  		if (high & (1<<31)) {  			char misc[20];  			char addr[24]; -			misc[0] = addr[0] = '\0'; + +			misc[0] = '\0'; +			addr[0] = '\0'; +  			if (high & (1<<29))  				recover |= 1;  			if (high & (1<<25))  				recover |= 2;  			high &= ~(1<<31); +  			if (high & (1<<27)) {  				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);  				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);  				snprintf(addr, 24, " at %08x%08x", ahigh, alow);  			} +  			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",  				smp_processor_id(), i, high, low, 
misc, addr);  		} @@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	/*  	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not  	 * recoverable/continuable.This will allow BIOS to look at the MSRs -	 * for errors if the OS could not log the error. +	 * for errors if the OS could not log the error:  	 */  	for (i = 0; i < nr_mce_banks; i++) {  		unsigned int msr; +  		msr = MSR_IA32_MC0_STATUS+i*4;  		rdmsr(msr, low, high);  		if (high & (1<<31)) { -			/* Clear it */ +			/* Clear it: */  			wrmsr(msr, 0UL, 0UL); -			/* Serialize */ +			/* Serialize: */  			wmb();  			add_taint(TAINT_MACHINE_CHECK);  		} @@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)  	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);  } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */  void intel_p6_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 l, h; @@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)  	/* Ok machine check is available */  	machine_check_vector = intel_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb();  	printk(KERN_INFO "Intel machine check architecture supported.\n"); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b..bff8dd191dd 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@  /* - *   * Thermal throttle event support code (such as syslog messaging and rate   * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + *   * This allows consistent reporting of CPU thermal throttle events.   *   * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,53 @@   * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.   *          Inspired by Ross Biro's and Al Borchers' counter code.   
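 *
 * For example, once the sysfs bits below are registered, the per-CPU event
 * count is typically readable from
 *
 *	/sys/devices/system/cpu/cpu0/thermal_throttle/count
 *
 * (illustrative path; the exact location follows the sysdev layout of the
 * running kernel).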
*/ - +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/sysdev.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h>  #include <linux/cpu.h> -#include <asm/cpu.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <asm/therm_throt.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/msr.h>  /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL              (300 * HZ) +#define CHECK_INTERVAL		(300 * HZ)  static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;  static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); + +static atomic_t therm_throt_en		= ATOMIC_INIT(0);  #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name)                              \ -        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) +#define define_therm_throt_sysdev_one_ro(_name)				\ +	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) -#define define_therm_throt_sysdev_show_func(name)                            \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \ -					struct sysdev_attribute *attr,	     \ -                                              char *buf)                     \ -{                                                                            \ -	unsigned int cpu = dev->id;                                          \ -	ssize_t ret;                                                         \ -                                                                             \ -	preempt_disable();              /* CPU hotplug */                    \ -	if (cpu_online(cpu))                                                 \ -		ret = sprintf(buf, "%lu\n",                                  \ -			      per_cpu(thermal_throttle_##name, cpu));        \ -	else                                                                 \ -		ret = 0;                                                     \ -	preempt_enable();                                                    \ -                                                                             \ -	return ret;                                                          \ +#define define_therm_throt_sysdev_show_func(name)			\ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,	\ +					struct sysdev_attribute *attr,	\ +					      char *buf)		\ +{									\ +	unsigned int cpu = dev->id;					\ +	ssize_t ret;							\ +									\ +	preempt_disable();	/* CPU hotplug */			\ +	if (cpu_online(cpu))						\ +		ret = sprintf(buf, "%lu\n",				\ +			      per_cpu(thermal_throttle_##name, cpu));	\ +	else								\ +		ret = 0;						\ +	preempt_enable();						\ +									\ +	return ret;							\  }  define_therm_throt_sysdev_show_func(count); @@ -61,8 +71,8 @@ static struct attribute *thermal_throttle_attrs[] = {  };  static struct attribute_group thermal_throttle_attr_group = { -	.attrs = thermal_throttle_attrs, -	.name = "thermal_throttle" +	.attrs	= thermal_throttle_attrs, +	.name	= "thermal_throttle"  };  #endif /* CONFIG_SYSFS */ @@ -82,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = {   *          1 : Event should be logged further, and a message has been   *              printed to the syslog.   
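 *
 * Typical use, roughly what the thermal interrupt handler added further
 * down in this file does:
 *
 *	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 *	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
 *		mce_log_therm_throt_event(msr_val);
 *
 * With CHECK_INTERVAL at 300*HZ this results in at most one syslog line per
 * CPU every ~5 minutes while the CPU keeps hitting its thermal limit.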
*/ -int therm_throt_process(int curr) +static int therm_throt_process(int curr)  {  	unsigned int cpu = smp_processor_id();  	__u64 tmp_jiffs = get_jiffies_64(); @@ -110,10 +120,11 @@ int therm_throt_process(int curr)  }  #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */  static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)  { -	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); +	return sysfs_create_group(&sys_dev->kobj, +				  &thermal_throttle_attr_group);  }  static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +132,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)  	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);  } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */  static DEFINE_MUTEX(therm_cpu_lock);  /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, -						   unsigned long action, -						   void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, +			      unsigned long action, +			      void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu;  	struct sys_device *sys_dev;  	int err = 0;  	sys_dev = get_cpu_sysdev(cpu); +  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: @@ -183,6 +196,94 @@ static __init int thermal_throttle_init_device(void)  	return 0;  } -  device_initcall(thermal_throttle_init_device); +  #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ +	__u64 msr_val; + +	rdmsrl(MSR_IA32_THERM_STATUS, msr_val); +	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) +		mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ +	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", +			smp_processor_id()); +	add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ +	exit_idle(); +	irq_enter(); +	inc_irq_stat(irq_thermal_count); +	smp_thermal_vector(); +	irq_exit(); +	/* Ack only at the end to avoid potential reentry */ +	ack_APIC_irq(); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ +	unsigned int cpu = smp_processor_id(); +	int tm2 = 0; +	u32 l, h; + +	/* Thermal monitoring depends on ACPI and clock modulation*/ +	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) +		return; + +	/* +	 * First check if its enabled already, in which case there might +	 * be some SMM goo which handles it, so we can't even put a handler +	 * since it might be delivered via SMI already: +	 */ +	rdmsr(MSR_IA32_MISC_ENABLE, l, h); +	h = apic_read(APIC_LVTTHMR); +	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { +		printk(KERN_DEBUG +		       "CPU%d: Thermal monitoring handled by SMI\n", cpu); +		return; +	} + +	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) +		tm2 = 1; + +	/* Check whether a vector already exists */ +	if (h & APIC_VECTOR_MASK) { +		printk(KERN_DEBUG +		       "CPU%d: Thermal LVT vector (%#x) already installed\n", +		       cpu, (h & APIC_VECTOR_MASK)); +		return; +	} + +	/* We'll mask the thermal vector in the lapic till we're ready: */ +	h = THERMAL_APIC_VECTOR | 
APIC_DM_FIXED | APIC_LVT_MASKED; +	apic_write(APIC_LVTTHMR, h); + +	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); +	wrmsr(MSR_IA32_THERM_INTERRUPT, +		l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + +	smp_thermal_vector = intel_thermal_interrupt; + +	rdmsr(MSR_IA32_MISC_ENABLE, l, h); +	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + +	/* Unmask the thermal vector: */ +	l = apic_read(APIC_LVTTHMR); +	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + +	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", +	       cpu, tm2 ? "TM2" : "TM1"); + +	/* enable thermal throttle processing */ +	atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f7..d746df2909c 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)  void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void)  {  	exit_idle();  	irq_enter(); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811..54060f56597 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,19 +2,17 @@   * IDT Winchip specific Machine Check Exception Reporting   * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>   */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h>  #include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h>  #include <asm/processor.h>  #include <asm/system.h> +#include <asm/mce.h>  #include <asm/msr.h> -#include "mce.h" - -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */  static void winchip_machine_check(struct pt_regs *regs, long error_code)  {  	printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)  void winchip_mcheck_init(struct cpuinfo_x86 *c)  {  	u32 lo, hi; +  	machine_check_vector = winchip_machine_check; +	/* Make sure the vector pointer is visible before we enable MCEs: */  	wmb(); +  	rdmsr(MSR_IDT_FCR1, lo, hi);  	lo |= (1<<2);	/* Enable EIERRINT (int 18 MCE) */  	lo &= ~(1<<4);	/* Enable MCE */  	wrmsr(MSR_IDT_FCR1, lo, hi); +  	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + +	printk(KERN_INFO +	       "Winchip machine check reporting enabled on CPU#0.\n");  } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)  	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)  		return 0; -	rdmsr(MTRRdefType_MSR, def, dummy); +	rdmsr(MSR_MTRRdefType, def, dummy);  	def &= 0xff;  	if (def != MTRR_TYPE_UNCACHABLE)  		return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)  	 */  	if (!is_cpu(INTEL) || disable_mtrr_trim)  		return 0; -	rdmsr(MTRRdefType_MSR, def, dummy); +	rdmsr(MSR_MTRRdefType, def, dummy);  	def &= 0xff;  	if (def != MTRR_TYPE_UNCACHABLE)  		return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d21d4fb161f..0543f69f0b2 100644 --- 
a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,9 +20,9 @@ struct fixed_range_block {  };  static struct fixed_range_block fixed_range_blocks[] = { -	{ MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */ -	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */ -	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */ +	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */ +	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */ +	{ MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */  	{}  }; @@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)  	k8_check_syscfg_dram_mod_en(); -	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); +	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);  	for (i = 0; i < 2; i++) -		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); +		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);  	for (i = 0; i < 8; i++) -		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); +		rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);  }  void mtrr_save_fixed_ranges(void *info) @@ -310,7 +310,7 @@ void __init get_mtrr_state(void)  	vrs = mtrr_state.var_ranges; -	rdmsr(MTRRcap_MSR, lo, dummy); +	rdmsr(MSR_MTRRcap, lo, dummy);  	mtrr_state.have_fixed = (lo >> 8) & 1;  	for (i = 0; i < num_var_ranges; i++) @@ -318,7 +318,7 @@ void __init get_mtrr_state(void)  	if (mtrr_state.have_fixed)  		get_fixed_ranges(mtrr_state.fixed_ranges); -	rdmsr(MTRRdefType_MSR, lo, dummy); +	rdmsr(MSR_MTRRdefType, lo, dummy);  	mtrr_state.def_type = (lo & 0xff);  	mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  	__flush_tlb();  	/*  Save MTRR state */ -	rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); +	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);  	/*  Disable MTRRs, and set the default type to uncached  */ -	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); +	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);  }  static void post_set(void) __releases(set_atomicity_lock) @@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)  	__flush_tlb();  	/* Intel (P6) standard MTRRs */ -	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); +	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);  	/*  Enable caches  */  	write_cr0(read_cr0() & 0xbfffffff); @@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i  static int generic_have_wrcomb(void)  {  	unsigned long config, dummy; -	rdmsr(MTRRcap_MSR, config, dummy); +	rdmsr(MSR_MTRRcap, config, dummy);  	return (config & (1 << 10));  } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)  	unsigned long config = 0, dummy;  	if (use_intel()) { -		rdmsr(MTRRcap_MSR, config, dummy); +		rdmsr(MSR_MTRRcap, config, dummy);  	} else if (is_cpu(AMD))  		config = 2;  	else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,21 +5,6 @@  #include <linux/types.h>  #include <linux/stddef.h> -#define MTRRcap_MSR     0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 
-#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f -  #define MTRR_CHANGE_MASK_FIXED     0x01  #define MTRR_CHANGE_MASK_VARIABLE  0x02  #define MTRR_CHANGE_MASK_DEFTYPE   0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)  		if (use_intel())  			/*  Save MTRR state */ -			rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); +			rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);  		else  			/* Cyrix ARRs - everything else were excluded at the top */  			ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)  {  	if (use_intel())  		/*  Disable MTRRs, and set the default type to uncached  */ -		mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, +		mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,  		      ctxt->deftype_hi);  	else if (is_cpu(CYRIX))  		/* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)  		/*  Restore MTRRdefType  */  		if (use_intel())  			/* Intel (P6) standard MTRRs */ -			mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); +			mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);  		else  			/* Cyrix ARRs - everything else was excluded at the top */  			setCx86(CX86_CCR3, ctxt->ccr3); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..76dfef23f78 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,1721 @@ +/* + * Performance counter x86 architecture code + * + *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + *  Copyright (C) 2009 Jaswinder Singh Rajput + *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + *  For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_counter.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> + +static u64 perf_counter_mask __read_mostly; + +struct cpu_hw_counters { +	struct perf_counter	*counters[X86_PMC_IDX_MAX]; +	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		interrupts; +	int			enabled; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { +	const char	*name; +	int		version; +	int		(*handle_irq)(struct pt_regs *); +	void		(*disable_all)(void); +	void		(*enable_all)(void); +	void		(*enable)(struct hw_perf_counter *, int); +	void		(*disable)(struct hw_perf_counter *, int); +	unsigned	eventsel; +	unsigned	perfctr; +	u64		(*event_map)(int); +	u64		(*raw_event)(u64); +	int		max_events; +	int		num_counters; +	int		num_counters_fixed; +	int		counter_bits; +	u64		counter_mask; +	u64		max_period; +	u64		
intel_ctrl; +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +	.enabled = 1, +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ +  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c, +  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e, +  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e, +  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +}; + +static u64 intel_pmu_event_map(int event) +{ +	return intel_perfmon_event_map[event]; +} + +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */ +		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */ +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */ +		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */ +		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */ +		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */ +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */ +		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* 
BR_INST_RETIRED.ALL_BRANCHES */ +		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 core2_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */ +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 atom_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */ +	}, +	[ 
C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static u64 intel_pmu_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL +#define CORE_EVNTSEL_INV_MASK		0x00800000ULL +#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL + +#define CORE_EVNTSEL_MASK		\ +	(CORE_EVNTSEL_EVENT_MASK |	\ +	 CORE_EVNTSEL_UNIT_MASK  |	\ +	 CORE_EVNTSEL_EDGE_MASK  |	\ +	 CORE_EVNTSEL_INV_MASK  |	\ +	 CORE_EVNTSEL_COUNTER_MASK) + +	return event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */ +		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */ +		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */ +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */ +		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ +		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback       
    */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */ +		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */ +		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */ +		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ +  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076, +  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080, +  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081, +  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +}; + +static u64 amd_pmu_event_map(int event) +{ +	return amd_perfmon_event_map[event]; +} + +static u64 amd_pmu_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL +#define K7_EVNTSEL_INV_MASK	0x000800000ULL +#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL + +#define K7_EVNTSEL_MASK			\ +	(K7_EVNTSEL_EVENT_MASK |	\ +	 K7_EVNTSEL_UNIT_MASK  |	\ +	 K7_EVNTSEL_EDGE_MASK  |	\ +	 K7_EVNTSEL_INV_MASK   |	\ +	 K7_EVNTSEL_COUNTER_MASK) + +	return event & K7_EVNTSEL_MASK; +} + +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_counter_update(struct perf_counter *counter, +			struct hw_perf_counter *hwc, int idx) +{ +	int shift = 64 - x86_pmu.counter_bits; +	u64 prev_raw_count, new_raw_count; +	s64 delta; + +	/* +	 * Careful: an NMI might modify the previous counter value. +	 * +	 * Our tactic to handle this is to first atomically read and +	 * exchange a new raw count - then add that new-prev delta +	 * count to the generic counter atomically: +	 */ +again: +	prev_raw_count = atomic64_read(&hwc->prev_count); +	rdmsrl(hwc->counter_base + idx, new_raw_count); + +	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, +					new_raw_count) != prev_raw_count) +		goto again; + +	/* +	 * Now we have the new raw value and have updated the prev +	 * timestamp already. We can now calculate the elapsed delta +	 * (counter-)time and add that to the generic counter. +	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. 
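	 *
	 * Worked example (illustrative numbers): with 48-bit counters the
	 * shift is 64 - 48 = 16.  If the counter wrapped from
	 * prev_raw_count == 0x0000fffffffffff0 to new_raw_count == 0x10,
	 * then (new << 16) - (prev << 16) == 0x200000 as a signed 64-bit
	 * value, and >> 16 yields a delta of 0x20 (32) events.  The two
	 * shifts discard whatever the hardware left in bits 48-63, so the
	 * result is the same whether or not the MSR read sign-extends.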
+	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &hwc->period_left); + +	return new_raw_count; +} + +static atomic_t active_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +	int i; + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		disable_lapic_nmi_watchdog(); + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) +			goto perfctr_fail; +	} + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) +			goto eventsel_fail; +	} + +	return true; + +eventsel_fail: +	for (i--; i >= 0; i--) +		release_evntsel_nmi(x86_pmu.eventsel + i); + +	i = x86_pmu.num_counters; + +perfctr_fail: +	for (i--; i >= 0; i--) +		release_perfctr_nmi(x86_pmu.perfctr + i); + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); + +	return false; +} + +static void release_pmc_hardware(void) +{ +	int i; + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		release_perfctr_nmi(x86_pmu.perfctr + i); +		release_evntsel_nmi(x86_pmu.eventsel + i); +	} + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ +	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { +		release_pmc_hardware(); +		mutex_unlock(&pmc_reserve_mutex); +	} +} + +static inline int x86_pmu_initialized(void) +{ +	return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ +	unsigned int cache_type, cache_op, cache_result; +	u64 config, val; + +	config = attr->config; + +	cache_type = (config >>  0) & 0xff; +	if (cache_type >= PERF_COUNT_HW_CACHE_MAX) +		return -EINVAL; + +	cache_op = (config >>  8) & 0xff; +	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) +		return -EINVAL; + +	cache_result = (config >> 16) & 0xff; +	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) +		return -EINVAL; + +	val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + +	if (val == 0) +		return -ENOENT; + +	if (val == -1) +		return -EINVAL; + +	hwc->config |= val; + +	return 0; +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_counter_init(struct perf_counter *counter) +{ +	struct perf_counter_attr *attr = &counter->attr; +	struct hw_perf_counter *hwc = &counter->hw; +	int err; + +	if (!x86_pmu_initialized()) +		return -ENODEV; + +	err = 0; +	if (!atomic_inc_not_zero(&active_counters)) { +		mutex_lock(&pmc_reserve_mutex); +		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) +			err = -EBUSY; +		else +			atomic_inc(&active_counters); +		mutex_unlock(&pmc_reserve_mutex); +	} +	if (err) +		return err; + +	/* +	 * Generate PMC IRQs: +	 * (keep 'enabled' bit clear for now) +	 */ +	hwc->config = ARCH_PERFMON_EVENTSEL_INT; + +	/* +	 * Count user and OS events unless requested not to. 
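	 * (ARCH_PERFMON_EVENTSEL_USR counts ring 3, ARCH_PERFMON_EVENTSEL_OS
	 * counts ring 0.)  For instance, a counter created with
	 * attr->exclude_user set ends up with
	 *
	 *	hwc->config == ARCH_PERFMON_EVENTSEL_INT | ARCH_PERFMON_EVENTSEL_OS
	 *
	 * before the event selector bits are merged in further down.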
+	 */ +	if (!attr->exclude_user) +		hwc->config |= ARCH_PERFMON_EVENTSEL_USR; +	if (!attr->exclude_kernel) +		hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + +	if (!hwc->sample_period) { +		hwc->sample_period = x86_pmu.max_period; +		hwc->last_period = hwc->sample_period; +		atomic64_set(&hwc->period_left, hwc->sample_period); +	} + +	counter->destroy = hw_perf_counter_destroy; + +	/* +	 * Raw event type provide the config in the event structure +	 */ +	if (attr->type == PERF_TYPE_RAW) { +		hwc->config |= x86_pmu.raw_event(attr->config); +		return 0; +	} + +	if (attr->type == PERF_TYPE_HW_CACHE) +		return set_ext_hw_attr(hwc, attr); + +	if (attr->config >= x86_pmu.max_events) +		return -EINVAL; +	/* +	 * The generic map: +	 */ +	hwc->config |= x86_pmu.event_map(attr->config); + +	return 0; +} + +static void intel_pmu_disable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void amd_pmu_disable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (!cpuc->enabled) +		return; + +	cpuc->enabled = 0; +	/* +	 * ensure we write the disable before we start disabling the +	 * counters proper, so that amd_pmu_enable_counter() does the +	 * right thing. +	 */ +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) +			continue; +		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_disable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	return x86_pmu.disable_all(); +} + +static void intel_pmu_enable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static void amd_pmu_enable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (cpuc->enabled) +		return; + +	cpuc->enabled = 1; +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) +			continue; +		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_enable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ +	u64 status; + +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + +	return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, mask; +	int err; + +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_disable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_disable_counter(hwc, idx); +} + 
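/*
 * A note on the fixed-counter controls above (sketch, illustrative values
 * only): MSR_ARCH_PERFMON_FIXED_CTR_CTRL carries one 4-bit control field per
 * fixed counter, which is why the mask is 0xf << (idx * 4).  For relative
 * idx 1, counting ring 0 and ring 3 with PMI enabled:
 *
 *	bits  = 0x8 | 0x2 | 0x1 = 0xb;
 *	field = bits << (1 * 4) = 0xb0;
 *
 * intel_pmu_disable_fixed() clears that nibble, and intel_pmu_enable_fixed()
 * below rebuilds it from hwc->config.
 */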
+static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	x86_pmu_disable_counter(hwc, idx); +} + +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static int +x86_perf_counter_set_period(struct perf_counter *counter, +			     struct hw_perf_counter *hwc, int idx) +{ +	s64 left = atomic64_read(&hwc->period_left); +	s64 period = hwc->sample_period; +	int err, ret = 0; + +	/* +	 * If we are way outside a reasoable range then just skip forward: +	 */ +	if (unlikely(left <= -period)) { +		left = period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} + +	if (unlikely(left <= 0)) { +		left += period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} +	/* +	 * Quirk: certain CPUs dont like it if just 1 event is left: +	 */ +	if (unlikely(left < 2)) +		left = 2; + +	if (left > x86_pmu.max_period) +		left = x86_pmu.max_period; + +	per_cpu(prev_left[idx], smp_processor_id()) = left; + +	/* +	 * The hw counter starts counting from this counter offset, +	 * mark it to be able to extra future deltas: +	 */ +	atomic64_set(&hwc->prev_count, (u64)-left); + +	err = checking_wrmsrl(hwc->counter_base + idx, +			     (u64)(-left) & x86_pmu.counter_mask); + +	return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, bits, mask; +	int err; + +	/* +	 * Enable IRQ generation (0x8), +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1) +	 * if requested: +	 */ +	bits = 0x8ULL; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) +		bits |= 0x2; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) +		bits |= 0x1; +	bits <<= (idx * 4); +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	ctrl_val |= bits; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_enable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + +	if (cpuc->enabled) +		x86_pmu_enable_counter(hwc, idx); +	else +		x86_pmu_disable_counter(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +{ +	unsigned int event; + +	if (!x86_pmu.num_counters_fixed) +		return -1; + +	/* +	 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: +	 */ +	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && +					boot_cpu_data.x86_model == 28) +		return -1; + +	event = hwc->config & ARCH_PERFMON_EVENT_MASK; + +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) +		return X86_PMC_IDX_FIXED_INSTRUCTIONS; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) +		return X86_PMC_IDX_FIXED_CPU_CYCLES; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) +		return X86_PMC_IDX_FIXED_BUS_CYCLES; + +	return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ +static int x86_pmu_enable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int 
idx; + +	idx = fixed_mode_idx(counter, hwc); +	if (idx >= 0) { +		/* +		 * Try to get the fixed counter, if that is already taken +		 * then try to get a generic counter: +		 */ +		if (test_and_set_bit(idx, cpuc->used_mask)) +			goto try_generic; + +		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; +		/* +		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to +		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: +		 */ +		hwc->counter_base = +			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; +		hwc->idx = idx; +	} else { +		idx = hwc->idx; +		/* Try to get the previous generic counter again */ +		if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: +			idx = find_first_zero_bit(cpuc->used_mask, +						  x86_pmu.num_counters); +			if (idx == x86_pmu.num_counters) +				return -EAGAIN; + +			set_bit(idx, cpuc->used_mask); +			hwc->idx = idx; +		} +		hwc->config_base  = x86_pmu.eventsel; +		hwc->counter_base = x86_pmu.perfctr; +	} + +	perf_counters_lapic_init(); + +	x86_pmu.disable(hwc, idx); + +	cpuc->counters[idx] = counter; +	set_bit(idx, cpuc->active_mask); + +	x86_perf_counter_set_period(counter, hwc, idx); +	x86_pmu.enable(hwc, idx); + +	return 0; +} + +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; + +	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || +				cpuc->counters[hwc->idx] != counter)) +		return; + +	x86_pmu.enable(hwc, hwc->idx); +} + +void perf_counter_print_debug(void) +{ +	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; +	struct cpu_hw_counters *cpuc; +	unsigned long flags; +	int cpu, idx; + +	if (!x86_pmu.num_counters) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	if (x86_pmu.version >= 2) { +		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); +		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + +		pr_info("\n"); +		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl); +		pr_info("CPU#%d: status:     %016llx\n", cpu, status); +		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow); +		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed); +	} +	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); +		rdmsrl(x86_pmu.perfctr  + idx, pmc_count); + +		prev_left = per_cpu(prev_left[idx], cpu); + +		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n", +			cpu, idx, pmc_ctrl); +		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n", +			cpu, idx, prev_left); +	} +	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + +		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +	} +	local_irq_restore(flags); +} + +static void x86_pmu_disable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int idx = hwc->idx; + +	/* +	 * Must be done before we disable, otherwise the nmi handler +	 * could reenable again: +	 */ +	clear_bit(idx, cpuc->active_mask); +	x86_pmu.disable(hwc, idx); + +	/* +	 * Make sure the cleared pointer becomes visible before we +	 * (potentially) free the counter: +	 */ +	barrier(); + +	/* +	 * Drain 
the remaining delta count out of a counter +	 * that we are disabling: +	 */ +	x86_perf_counter_update(counter, hwc, idx); +	cpuc->counters[idx] = NULL; +	clear_bit(idx, cpuc->used_mask); +} + +/* + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: + */ +static int intel_pmu_save_and_restart(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	int idx = hwc->idx; +	int ret; + +	x86_perf_counter_update(counter, hwc, idx); +	ret = x86_perf_counter_set_period(counter, hwc, idx); + +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		intel_pmu_enable_counter(hwc, idx); + +	return ret; +} + +static void intel_pmu_reset(void) +{ +	unsigned long flags; +	int idx; + +	if (!x86_pmu.num_counters) +		return; + +	local_irq_save(flags); + +	printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); +		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull); +	} +	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); +	} + +	local_irq_restore(flags); +} + + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ +	struct perf_sample_data data; +	struct cpu_hw_counters *cpuc; +	int bit, cpu, loops; +	u64 ack, status; + +	data.regs = regs; +	data.addr = 0; + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	perf_disable(); +	status = intel_pmu_get_status(); +	if (!status) { +		perf_enable(); +		return 0; +	} + +	loops = 0; +again: +	if (++loops > 100) { +		WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); +		perf_counter_print_debug(); +		intel_pmu_reset(); +		perf_enable(); +		return 1; +	} + +	inc_irq_stat(apic_perf_irqs); +	ack = status; +	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { +		struct perf_counter *counter = cpuc->counters[bit]; + +		clear_bit(bit, (unsigned long *) &status); +		if (!test_bit(bit, cpuc->active_mask)) +			continue; + +		if (!intel_pmu_save_and_restart(counter)) +			continue; + +		data.period = counter->hw.last_period; + +		if (perf_counter_overflow(counter, 1, &data)) +			intel_pmu_disable_counter(&counter->hw, bit); +	} + +	intel_pmu_ack_status(ack); + +	/* +	 * Repeat if there is more work to be done: +	 */ +	status = intel_pmu_get_status(); +	if (status) +		goto again; + +	perf_enable(); + +	return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ +	struct perf_sample_data data; +	struct cpu_hw_counters *cpuc; +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	int cpu, idx, handled = 0; +	u64 val; + +	data.regs = regs; +	data.addr = 0; + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		if (!test_bit(idx, cpuc->active_mask)) +			continue; + +		counter = cpuc->counters[idx]; +		hwc = &counter->hw; + +		val = x86_perf_counter_update(counter, hwc, idx); +		if (val & (1ULL << (x86_pmu.counter_bits - 1))) +			continue; + +		/* +		 * counter overflow +		 */ +		handled		= 1; +		data.period	= counter->hw.last_period; + +		if (!x86_perf_counter_set_period(counter, hwc, idx)) +			continue; + +		if (perf_counter_overflow(counter, 1, &data)) +			amd_pmu_disable_counter(hwc, idx); +	} + +	if (handled) +		inc_irq_stat(apic_perf_irqs); + +	return handled; +} + +void 
smp_perf_pending_interrupt(struct pt_regs *regs) +{ +	irq_enter(); +	ack_APIC_irq(); +	inc_irq_stat(apic_pending_irqs); +	perf_counter_do_pending(); +	irq_exit(); +} + +void set_perf_counter_pending(void) +{ +	apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + +void perf_counters_lapic_init(void) +{ +	if (!x86_pmu_initialized()) +		return; + +	/* +	 * Always use NMI for PMU +	 */ +	apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, +			 unsigned long cmd, void *__args) +{ +	struct die_args *args = __args; +	struct pt_regs *regs; + +	if (!atomic_read(&active_counters)) +		return NOTIFY_DONE; + +	switch (cmd) { +	case DIE_NMI: +	case DIE_NMI_IPI: +		break; + +	default: +		return NOTIFY_DONE; +	} + +	regs = args->regs; + +	apic_write(APIC_LVTPC, APIC_DM_NMI); +	/* +	 * Can't rely on the handled return value to say it was our NMI, two +	 * counters could trigger 'simultaneously' raising two back-to-back NMIs. +	 * +	 * If the first NMI handles both, the latter will be empty and daze +	 * the CPU. +	 */ +	x86_pmu.handle_irq(regs); + +	return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { +	.notifier_call		= perf_counter_nmi_handler, +	.next			= NULL, +	.priority		= 1 +}; + +static struct x86_pmu intel_pmu = { +	.name			= "Intel", +	.handle_irq		= intel_pmu_handle_irq, +	.disable_all		= intel_pmu_disable_all, +	.enable_all		= intel_pmu_enable_all, +	.enable			= intel_pmu_enable_counter, +	.disable		= intel_pmu_disable_counter, +	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0, +	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0, +	.event_map		= intel_pmu_event_map, +	.raw_event		= intel_pmu_raw_event, +	.max_events		= ARRAY_SIZE(intel_perfmon_event_map), +	/* +	 * Intel PMCs cannot be accessed sanely above 32 bit width, +	 * so we install an artificial 1<<31 period regardless of +	 * the generic counter period: +	 */ +	.max_period		= (1ULL << 31) - 1, +}; + +static struct x86_pmu amd_pmu = { +	.name			= "AMD", +	.handle_irq		= amd_pmu_handle_irq, +	.disable_all		= amd_pmu_disable_all, +	.enable_all		= amd_pmu_enable_all, +	.enable			= amd_pmu_enable_counter, +	.disable		= amd_pmu_disable_counter, +	.eventsel		= MSR_K7_EVNTSEL0, +	.perfctr		= MSR_K7_PERFCTR0, +	.event_map		= amd_pmu_event_map, +	.raw_event		= amd_pmu_raw_event, +	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), +	.num_counters		= 4, +	.counter_bits		= 48, +	.counter_mask		= (1ULL << 48) - 1, +	/* use highest bit to detect overflow */ +	.max_period		= (1ULL << 47) - 1, +}; + +static int intel_pmu_init(void) +{ +	union cpuid10_edx edx; +	union cpuid10_eax eax; +	unsigned int unused; +	unsigned int ebx; +	int version; + +	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) +		return -ENODEV; + +	/* +	 * Check whether the Architectural PerfMon supports +	 * Branch Misses Retired Event or not. 
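	 * (CPUID leaf 0xa, queried just below, is the architectural perfmon
	 * leaf: EAX reports the version, the number of generic counters,
	 * their bit width and the length of the EBX event-availability mask,
	 * hence the mask_length check, while EDX describes the fixed-function
	 * counters.  On a typical v2 part this might read version_id = 2,
	 * num_counters = 2, bit_width = 40; those numbers are illustrative
	 * only.)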
+	 */ +	cpuid(10, &eax.full, &ebx, &unused, &edx.full); +	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) +		return -ENODEV; + +	version = eax.split.version_id; +	if (version < 2) +		return -ENODEV; + +	x86_pmu				= intel_pmu; +	x86_pmu.version			= version; +	x86_pmu.num_counters		= eax.split.num_counters; +	x86_pmu.counter_bits		= eax.split.bit_width; +	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1; + +	/* +	 * Quirk: v2 perfmon does not report fixed-purpose counters, so +	 * assume at least 3 counters: +	 */ +	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3); + +	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + +	/* +	 * Install the hw-cache-events table: +	 */ +	switch (boot_cpu_data.x86_model) { +	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ +	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ +	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ +	case 29: /* six-core 45 nm xeon "Dunnington" */ +		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Core2 events, "); +		break; +	default: +	case 26: +		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Nehalem/Corei7 events, "); +		break; +	case 28: +		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Atom events, "); +		break; +	} +	return 0; +} + +static int amd_pmu_init(void) +{ +	/* Performance-monitoring supported from K7 and later: */ +	if (boot_cpu_data.x86 < 6) +		return -ENODEV; + +	x86_pmu = amd_pmu; + +	/* Events are common for all AMDs */ +	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, +	       sizeof(hw_cache_event_ids)); + +	return 0; +} + +void __init init_hw_perf_counters(void) +{ +	int err; + +	pr_info("Performance Counters: "); + +	switch (boot_cpu_data.x86_vendor) { +	case X86_VENDOR_INTEL: +		err = intel_pmu_init(); +		break; +	case X86_VENDOR_AMD: +		err = amd_pmu_init(); +		break; +	default: +		return; +	} +	if (err != 0) { +		pr_cont("no PMU driver, software counters only.\n"); +		return; +	} + +	pr_cont("%s PMU driver.\n", x86_pmu.name); + +	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { +		x86_pmu.num_counters = X86_PMC_MAX_GENERIC; +		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", +		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC); +	} +	perf_counter_mask = (1 << x86_pmu.num_counters) - 1; +	perf_max_counters = x86_pmu.num_counters; + +	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { +		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; +		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", +		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); +	} + +	perf_counter_mask |= +		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + +	perf_counters_lapic_init(); +	register_die_notifier(&perf_counter_nmi_notifier); + +	pr_info("... version:                 %d\n",     x86_pmu.version); +	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits); +	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters); +	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask); +	pr_info("... max period:              %016Lx\n", x86_pmu.max_period); +	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed); +	pr_info("... 
counter mask:            %016Lx\n", perf_counter_mask); +} + +static inline void x86_pmu_read(struct perf_counter *counter) +{ +	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + +static const struct pmu pmu = { +	.enable		= x86_pmu_enable, +	.disable	= x86_pmu_disable, +	.read		= x86_pmu_read, +	.unthrottle	= x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ +	int err; + +	err = __hw_perf_counter_init(counter); +	if (err) +		return ERR_PTR(err); + +	return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ +	if (entry->nr < PERF_MAX_STACK_DEPTH) +		entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ +	/* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ +	/* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ +	/* Process all stacks: */ +	return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ +	struct perf_callchain_entry *entry = data; + +	if (reliable) +		callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { +	.warning		= backtrace_warning, +	.warning_symbol		= backtrace_warning_symbol, +	.stack			= backtrace_stack, +	.address		= backtrace_address, +}; + +#include "../dumpstack.h" + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	callchain_store(entry, PERF_CONTEXT_KERNEL); +	callchain_store(entry, regs->ip); + +	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ +	unsigned long offset, addr = (unsigned long)from; +	int type = in_nmi() ? 
KM_NMI : KM_IRQ0; +	unsigned long size, len = 0; +	struct page *page; +	void *map; +	int ret; + +	do { +		ret = __get_user_pages_fast(addr, 1, 0, &page); +		if (!ret) +			break; + +		offset = addr & (PAGE_SIZE - 1); +		size = min(PAGE_SIZE - offset, n - len); + +		map = kmap_atomic(page, type); +		memcpy(to, map+offset, size); +		kunmap_atomic(map, type); +		put_page(page); + +		len  += size; +		to   += size; +		addr += size; + +	} while (len < n); + +	return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ +	unsigned long bytes; + +	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + +	return bytes == sizeof(*frame); +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	struct stack_frame frame; +	const void __user *fp; + +	if (!user_mode(regs)) +		regs = task_pt_regs(current); + +	fp = (void __user *)regs->bp; + +	callchain_store(entry, PERF_CONTEXT_USER); +	callchain_store(entry, regs->ip); + +	while (entry->nr < PERF_MAX_STACK_DEPTH) { +		frame.next_frame	     = NULL; +		frame.return_address = 0; + +		if (!copy_stack_frame(fp, &frame)) +			break; + +		if ((unsigned long)fp < regs->sp) +			break; + +		callchain_store(entry, frame.return_address); +		fp = frame.next_frame; +	} +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	int is_user; + +	if (!regs) +		return; + +	is_user = user_mode(regs); + +	if (!current || current->pid == 0) +		return; + +	if (is_user && current->state != TASK_RUNNING) +		return; + +	if (!is_user) +		perf_callchain_kernel(regs, entry); + +	if (current->mm) +		perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ +	struct perf_callchain_entry *entry; + +	if (in_nmi()) +		entry = &__get_cpu_var(nmi_entry); +	else +		entry = &__get_cpu_var(irq_entry); + +	entry->nr = 0; + +	perf_do_callchain(regs, entry); + +	return entry; +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e3..5c481f6205b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -19,8 +19,8 @@  #include <linux/nmi.h>  #include <linux/kprobes.h> -#include <asm/genapic.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/apic.h> +#include <asm/perf_counter.h>  struct nmi_watchdog_ctlblk {  	unsigned int cccr_msr; @@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)  		wd_ops = &k7_wd_ops;  		break;  	case X86_VENDOR_INTEL: -		/* -		 * Work around Core Duo (Yonah) errata AE49 where perfctr1 -		 * doesn't have a working enable bit. 
+		/* Work around where perfctr1 doesn't have a working enable +		 * bit as described in the following errata: +		 * AE49 Core Duo and Intel Core Solo 65 nm +		 * AN49 Intel Pentium Dual-Core +		 * AF49 Dual-Core Intel Xeon Processor LV  		 */ -		if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { +		if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || +		    ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && +		     boot_cpu_data.x86_mask == 4))) {  			intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;  			intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;  		} diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2ac1f0c2beb..b07af886124 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =  	.notifier_call = cpuid_class_cpu_callback,  }; +static char *cpuid_nodename(struct device *dev) +{ +	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} +  static int __init cpuid_init(void)  {  	int i, err = 0; @@ -198,6 +203,7 @@ static int __init cpuid_init(void)  		err = PTR_ERR(cpuid_class);  		goto out_chrdev;  	} +	cpuid_class->nodename = cpuid_nodename;  	for_each_online_cpu(i) {  		err = cpuid_device_create(i);  		if (err != 0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ff958248e61..5e409dc298a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,6 +27,7 @@  #include <asm/cpu.h>  #include <asm/reboot.h>  #include <asm/virtext.h> +#include <asm/iommu.h>  #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)  #ifdef CONFIG_HPET_TIMER  	hpet_disable();  #endif + +#ifdef CONFIG_X86_64 +	pci_iommu_shutdown(); +#endif +  	crash_save_cpu(regs, safe_smp_processor_id());  } diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765..48bfe138603 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,45 +19,61 @@   * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009   */ - -#include <asm/ds.h> - -#include <linux/errno.h> +#include <linux/kernel.h>  #include <linux/string.h> -#include <linux/slab.h> +#include <linux/errno.h>  #include <linux/sched.h> +#include <linux/slab.h>  #include <linux/mm.h> -#include <linux/kernel.h> +#include <linux/trace_clock.h> + +#include <asm/ds.h> +#include "ds_selftest.h"  /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation:   */  struct ds_configuration { -	/* the name of the configuration */ -	const char *name; -	/* the size of one pointer-typed field in the DS structure and -	   in the BTS and PEBS buffers in bytes; -	   this covers the first 8 DS fields related to buffer management. */ -	unsigned char  sizeof_field; -	/* the size of a BTS/PEBS record in bytes */ -	unsigned char  sizeof_rec[2]; -	/* a series of bit-masks to control various features indexed -	 * by enum ds_feature */ -	unsigned long ctl[dsf_ctl_max]; +	/* The name of the configuration: */ +	const char		*name; + +	/* The size of pointer-typed fields in DS, BTS, and PEBS: */ +	unsigned char		sizeof_ptr_field; + +	/* The size of a BTS/PEBS record in bytes: */ +	unsigned char		sizeof_rec[2]; + +	/* The number of pebs counter reset values in the DS structure. 
*/ +	unsigned char		nr_counter_reset; + +	/* Control bit-masks indexed by enum ds_feature: */ +	unsigned long		ctl[dsf_ctl_max];  }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; + + +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS		0x80 -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS		(3 * 8) -#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */ +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT		(1 << 3) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ -  ds_cfg.ctl[dsf_bts_overflow]) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS	8 +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE	8 + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL				  \ +	( ds_cfg.ctl[dsf_bts]			| \ +	  ds_cfg.ctl[dsf_bts_kernel]		| \ +	  ds_cfg.ctl[dsf_bts_user]		| \ +	  ds_cfg.ctl[dsf_bts_overflow] )  /*   * A BTS or PEBS tracer. @@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);   * to identify tracers.   */  struct ds_tracer { -	/* the DS context (partially) owned by this tracer */ -	struct ds_context *context; -	/* the buffer provided on ds_request() and its size in bytes */ -	void *buffer; -	size_t size; +	/* The DS context (partially) owned by this tracer. */ +	struct ds_context	*context; +	/* The buffer provided on ds_request() and its size in bytes. */ +	void			*buffer; +	size_t			size;  };  struct bts_tracer { -	/* the common DS part */ -	struct ds_tracer ds; -	/* the trace including the DS configuration */ -	struct bts_trace trace; -	/* buffer overflow notification function */ -	bts_ovfl_callback_t ovfl; +	/* The common DS part: */ +	struct ds_tracer	ds; + +	/* The trace including the DS configuration: */ +	struct bts_trace	trace; + +	/* Buffer overflow notification function: */ +	bts_ovfl_callback_t	ovfl; + +	/* Active flags affecting trace collection. */ +	unsigned int		flags;  };  struct pebs_tracer { -	/* the common DS part */ -	struct ds_tracer ds; -	/* the trace including the DS configuration */ -	struct pebs_trace trace; -	/* buffer overflow notification function */ -	pebs_ovfl_callback_t ovfl; +	/* The common DS part: */ +	struct ds_tracer	ds; + +	/* The trace including the DS configuration: */ +	struct pebs_trace	trace; + +	/* Buffer overflow notification function: */ +	pebs_ovfl_callback_t	ovfl;  };  /* @@ -97,6 +120,7 @@ struct pebs_tracer {   *   * The DS configuration consists of the following fields; different   * architetures vary in the size of those fields. 
+ *   * - double-word aligned base linear address of the BTS buffer   * - write pointer into the BTS buffer   * - end linear address of the BTS buffer (one byte beyond the end of @@ -135,21 +159,22 @@ enum ds_field {  };  enum ds_qualifier { -	ds_bts  = 0, +	ds_bts = 0,  	ds_pebs  }; -static inline unsigned long ds_get(const unsigned char *base, -				   enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)  { -	base += (ds_cfg.sizeof_field * (field + (4 * qual))); +	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));  	return *(unsigned long *)base;  } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, -			  enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, +       unsigned long value)  { -	base += (ds_cfg.sizeof_field * (field + (4 * qual))); +	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));  	(*(unsigned long *)base) = value;  } @@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,   */  static DEFINE_SPINLOCK(ds_lock); -  /*   * We either support (system-wide) per-cpu or per-thread allocation.   * We distinguish the two based on the task_struct pointer, where a @@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);   */  static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task)  { -	if (task) +	int error; + +	spin_lock_irq(&ds_lock); + +	if (task) { +		error = -EPERM; +		if (atomic_read(&tracers) < 0) +			goto out;  		atomic_inc(&tracers); -	else +	} else { +		error = -EPERM; +		if (atomic_read(&tracers) > 0) +			goto out;  		atomic_dec(&tracers); +	} + +	error = 0; +out: +	spin_unlock_irq(&ds_lock); +	return error;  }  static inline void put_tracer(struct task_struct *task) @@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)  		atomic_inc(&tracers);  } -static inline int check_tracer(struct task_struct *task) -{ -	return task ? -		(atomic_read(&tracers) >= 0) : -		(atomic_read(&tracers) <= 0); -} - -  /*   * The DS context is either attached to a thread or to a cpu:   * - in the former case, the thread_struct contains a pointer to the @@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)   * deallocated when the last user puts the context.   
*/  struct ds_context { -	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ -	unsigned char ds[MAX_SIZEOF_DS]; -	/* the owner of the BTS and PEBS configuration, respectively */ -	struct bts_tracer *bts_master; -	struct pebs_tracer *pebs_master; -	/* use count */ -	unsigned long count; -	/* a pointer to the context location inside the thread_struct -	 * or the per_cpu context array */ -	struct ds_context **this; -	/* a pointer to the task owning this context, or NULL, if the -	 * context is owned by a cpu */ -	struct task_struct *task; -}; +	/* The DS configuration; goes into MSR_IA32_DS_AREA: */ +	unsigned char		ds[MAX_SIZEOF_DS]; + +	/* The owner of the BTS and PEBS configuration, respectively: */ +	struct bts_tracer	*bts_master; +	struct pebs_tracer	*pebs_master; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); +	/* Use count: */ +	unsigned long		count; -#define system_context per_cpu(system_context_array, smp_processor_id()) +	/* Pointer to the context pointer field: */ +	struct ds_context	**this; + +	/* The traced task; NULL for cpu tracing: */ +	struct task_struct	*task; + +	/* The traced cpu; only valid if task is NULL: */ +	int			cpu; +}; +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) + +static struct ds_context *ds_get_context(struct task_struct *task, int cpu)  {  	struct ds_context **p_context = -		(task ? &task->thread.ds_ctx : &system_context); +		(task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));  	struct ds_context *context = NULL;  	struct ds_context *new_context = NULL; -	unsigned long irq;  	/* Chances are small that we already have a context. */  	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);  	if (!new_context)  		return NULL; -	spin_lock_irqsave(&ds_lock, irq); +	spin_lock_irq(&ds_lock);  	context = *p_context; -	if (!context) { +	if (likely(!context)) {  		context = new_context;  		context->this = p_context;  		context->task = task; +		context->cpu = cpu;  		context->count = 0; -		if (task) -			set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - -		if (!task || (task == current)) -			wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); -  		*p_context = context;  	}  	context->count++; -	spin_unlock_irqrestore(&ds_lock, irq); +	spin_unlock_irq(&ds_lock);  	if (context != new_context)  		kfree(new_context); @@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)  	return context;  } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context)  { +	struct task_struct *task;  	unsigned long irq;  	if (!context) @@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)  	*(context->this) = NULL; -	if (context->task) -		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); +	task = context->task; + +	if (task) +		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); -	if (!context->task || (context->task == current)) -		wrmsrl(MSR_IA32_DS_AREA, 0); +	/* +	 * We leave the (now dangling) pointer to the DS configuration in +	 * the DS_AREA msr. This is as good or as bad as replacing it with +	 * NULL - the hardware would crash if we enabled tracing. +	 * +	 * This saves us some problems with having to write an msr on a +	 * different cpu while preventing others from doing the same for the +	 * next context for that same cpu. +	 */  	spin_unlock_irqrestore(&ds_lock, irq); +	/* The context might still be in use for context switching. 
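+	 * ds_switch_to() still dereferences the task's ds_ctx on the
+	 * context-switch path, so wait until the traced task has been
+	 * scheduled out once before freeing it.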
+	 */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
 	kfree(context);
 }
+static void ds_install_ds_area(struct ds_context *context)
+{
+	unsigned long ds;
+
+	ds = (unsigned long)context->ds;
+
+	/*
+	 * There is a race between the bts master and the pebs master.
+	 *
+	 * The thread/cpu access is synchronized via get/put_cpu() for
+	 * task tracing and via wrmsr_on_cpu for cpu tracing.
+	 *
+	 * If bts and pebs are collected for the same task or same cpu,
+	 * the same configuration is written twice.
+	 */
+	if (context->task) {
+		get_cpu();
+		if (context->task == current)
+			wrmsrl(MSR_IA32_DS_AREA, ds);
+		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+		put_cpu();
+	} else
+		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+			     (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
 
 /*
  * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
  * The remainder of any partially written record is zeroed out.
  *
  * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
  */
 static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		    const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		unsigned long write_size, adj_write_size;
 
 		/*
-		 * write as much as possible without producing an
+		 * Write as much as possible without producing an
 		 * overflow interrupt.
 		 *
-		 * interrupt_threshold must either be
+		 * Interrupt_threshold must either be
 		 * - bigger than absolute_maximum or
 		 * - point to a record between buffer_base and absolute_maximum
 		 *
-		 * index points to a valid record.
+		 * Index points to a valid record.
 		 */
 		base   = ds_get(context->ds, qual, ds_buffer_base);
 		index  = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 
 		write_end = min(end, int_th);
 
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
+		/*
+		 * If we are already beyond the interrupt threshold,
+		 * we fill the entire buffer.
+		 */
 		if (write_end <= index)
 			write_end = end;
 
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
 
-		/* zero out trailing bytes */
+		/* Zero out trailing bytes. */
 		memset((char *)index + write_size, 0,
 		       adj_write_size - write_size);
 		index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  * Later architectures use 64bit pointers throughout, whereas earlier
  * architectures use 32bit pointers in 32bit mode.
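 *
 * (Worked example, assuming the 64-bit layout with 8-byte fields: the
 * PEBS index field sits at offset 8 * (ds_index + 4 * ds_pebs), i.e.
 * 8 * (1 + 4) = 40 bytes into the DS area, which is the offset that
 * ds_get() and ds_set() compute above.)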
* - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on:   * - the field size stored in the DS configuration   * - the relative field position   * @@ -431,23 +501,23 @@ enum bts_field {  	bts_to,  	bts_flags, -	bts_qual = bts_from, -	bts_jiffies = bts_to, -	bts_pid = bts_flags, +	bts_qual		= bts_from, +	bts_clock		= bts_to, +	bts_pid			= bts_flags, -	bts_qual_mask = (bts_qual_max - 1), -	bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +	bts_qual_mask		= (bts_qual_max - 1), +	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)  };  static inline unsigned long bts_get(const char *base, enum bts_field field)  { -	base += (ds_cfg.sizeof_field * field); +	base += (ds_cfg.sizeof_ptr_field * field);  	return *(unsigned long *)base;  }  static inline void bts_set(char *base, enum bts_field field, unsigned long val)  { -	base += (ds_cfg.sizeof_field * field);; +	base += (ds_cfg.sizeof_ptr_field * field);;  	(*(unsigned long *)base) = val;  } @@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)   *   * return: bytes read/written on success; -Eerrno, otherwise   */ -static int bts_read(struct bts_tracer *tracer, const void *at, -		    struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)  {  	if (!tracer)  		return -EINVAL; @@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,  	memset(out, 0, sizeof(*out));  	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {  		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); -		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); -		out->variant.timestamp.pid = bts_get(at, bts_pid); +		out->variant.event.clock = bts_get(at, bts_clock); +		out->variant.event.pid = bts_get(at, bts_pid);  	} else {  		out->qualifier = bts_branch;  		out->variant.lbr.from = bts_get(at, bts_from); @@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)  	case bts_task_arrives:  	case bts_task_departs:  		bts_set(raw, bts_qual, (bts_escape | in->qualifier)); -		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); -		bts_set(raw, bts_pid, in->variant.timestamp.pid); +		bts_set(raw, bts_clock, in->variant.event.clock); +		bts_set(raw, bts_pid, in->variant.event.pid);  		break;  	default:  		return -EINVAL; @@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,  			     unsigned int flags) {  	unsigned long buffer, adj; -	/* adjust the buffer address and size to meet alignment +	/* +	 * Adjust the buffer address and size to meet alignment  	 * constraints:  	 * - buffer is double-word aligned  	 * - size is multiple of record size @@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,  	trace->begin = (void *)buffer;  	trace->top = trace->begin;  	trace->end = (void *)(buffer + size); -	/* The value for 'no threshold' is -1, which will set the +	/* +	 * The value for 'no threshold' is -1, which will set the  	 * threshold outside of the buffer, just like we want it.  	 
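+	 *
+	 * (Worked example: with the 8-byte field layout a BTS record is
+	 * 24 bytes, so th == 2 records becomes ith == 48 bytes and
+	 * trace->ith ends up 48 bytes below trace->end; th == (size_t)-1
+	 * makes ith wrap in the multiplication below, so trace->ith lands
+	 * past trace->end and the threshold is effectively disabled.)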
*/ +	ith *= ds_cfg.sizeof_rec[qual];  	trace->ith = (void *)(buffer + size - ith);  	trace->flags = flags; @@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,  static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,  		      enum ds_qualifier qual, struct task_struct *task, -		      void *base, size_t size, size_t th, unsigned int flags) +		      int cpu, void *base, size_t size, size_t th)  {  	struct ds_context *context;  	int error; +	size_t req_size; + +	error = -EOPNOTSUPP; +	if (!ds_cfg.sizeof_rec[qual]) +		goto out;  	error = -EINVAL;  	if (!base)  		goto out; -	/* we require some space to do alignment adjustments below */ +	req_size = ds_cfg.sizeof_rec[qual]; +	/* We might need space for alignment adjustments. */ +	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) +		req_size += DS_ALIGNMENT; +  	error = -EINVAL; -	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) +	if (size < req_size)  		goto out;  	if (th != (size_t)-1) { @@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,  	tracer->size = size;  	error = -ENOMEM; -	context = ds_get_context(task); +	context = ds_get_context(task, cpu);  	if (!context)  		goto out;  	tracer->context = context; -	ds_init_ds_trace(trace, qual, base, size, th, flags); +	/* +	 * Defer any tracer-specific initialization work for the context until +	 * context ownership has been clarified. +	 */  	error = 0;   out:  	return error;  } -struct bts_tracer *ds_request_bts(struct task_struct *task, -				  void *base, size_t size, -				  bts_ovfl_callback_t ovfl, size_t th, -				  unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, +					 void *base, size_t size, +					 bts_ovfl_callback_t ovfl, size_t th, +					 unsigned int flags)  {  	struct bts_tracer *tracer; -	unsigned long irq;  	int error; +	/* Buffer overflow notification is not yet implemented. */  	error = -EOPNOTSUPP; -	if (!ds_cfg.ctl[dsf_bts]) +	if (ovfl)  		goto out; -	/* buffer overflow notification is not yet implemented */ -	error = -EOPNOTSUPP; -	if (ovfl) +	error = get_tracer(task); +	if (error < 0)  		goto out;  	error = -ENOMEM;  	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);  	if (!tracer) -		goto out; +		goto out_put_tracer;  	tracer->ovfl = ovfl; +	/* Do some more error checking and acquire a tracing context. */  	error = ds_request(&tracer->ds, &tracer->trace.ds, -			   ds_bts, task, base, size, th, flags); +			   ds_bts, task, cpu, base, size, th);  	if (error < 0)  		goto out_tracer; - -	spin_lock_irqsave(&ds_lock, irq); - -	error = -EPERM; -	if (!check_tracer(task)) -		goto out_unlock; -	get_tracer(task); +	/* Claim the bts part of the tracing context we acquired above. */ +	spin_lock_irq(&ds_lock);  	error = -EPERM;  	if (tracer->ds.context->bts_master) -		goto out_put_tracer; +		goto out_unlock;  	tracer->ds.context->bts_master = tracer; -	spin_unlock_irqrestore(&ds_lock, irq); +	spin_unlock_irq(&ds_lock); +	/* +	 * Now that we own the bts part of the context, let's complete the +	 * initialization for that part. +	 */ +	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); +	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); +	ds_install_ds_area(tracer->ds.context);  	tracer->trace.read  = bts_read;  	tracer->trace.write = bts_write; -	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); +	/* Start tracing. 
*/  	ds_resume_bts(tracer);  	return tracer; - out_put_tracer: -	put_tracer(task);   out_unlock: -	spin_unlock_irqrestore(&ds_lock, irq); +	spin_unlock_irq(&ds_lock);  	ds_put_context(tracer->ds.context);   out_tracer:  	kfree(tracer); + out_put_tracer: +	put_tracer(task);   out:  	return ERR_PTR(error);  } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, -				    void *base, size_t size, -				    pebs_ovfl_callback_t ovfl, size_t th, -				    unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, +				       void *base, size_t size, +				       bts_ovfl_callback_t ovfl, +				       size_t th, unsigned int flags) +{ +	return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, +				      bts_ovfl_callback_t ovfl, +				      size_t th, unsigned int flags) +{ +	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, +					   void *base, size_t size, +					   pebs_ovfl_callback_t ovfl, size_t th, +					   unsigned int flags)  {  	struct pebs_tracer *tracer; -	unsigned long irq;  	int error; -	/* buffer overflow notification is not yet implemented */ +	/* Buffer overflow notification is not yet implemented. */  	error = -EOPNOTSUPP;  	if (ovfl)  		goto out; +	error = get_tracer(task); +	if (error < 0) +		goto out; +  	error = -ENOMEM;  	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);  	if (!tracer) -		goto out; +		goto out_put_tracer;  	tracer->ovfl = ovfl; +	/* Do some more error checking and acquire a tracing context. */  	error = ds_request(&tracer->ds, &tracer->trace.ds, -			   ds_pebs, task, base, size, th, flags); +			   ds_pebs, task, cpu, base, size, th);  	if (error < 0)  		goto out_tracer; -	spin_lock_irqsave(&ds_lock, irq); - -	error = -EPERM; -	if (!check_tracer(task)) -		goto out_unlock; -	get_tracer(task); +	/* Claim the pebs part of the tracing context we acquired above. */ +	spin_lock_irq(&ds_lock);  	error = -EPERM;  	if (tracer->ds.context->pebs_master) -		goto out_put_tracer; +		goto out_unlock;  	tracer->ds.context->pebs_master = tracer; -	spin_unlock_irqrestore(&ds_lock, irq); +	spin_unlock_irq(&ds_lock); +	/* +	 * Now that we own the pebs part of the context, let's complete the +	 * initialization for that part. +	 */ +	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);  	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); +	ds_install_ds_area(tracer->ds.context); + +	/* Start tracing. 
*/  	ds_resume_pebs(tracer);  	return tracer; - out_put_tracer: -	put_tracer(task);   out_unlock: -	spin_unlock_irqrestore(&ds_lock, irq); +	spin_unlock_irq(&ds_lock);  	ds_put_context(tracer->ds.context);   out_tracer:  	kfree(tracer); + out_put_tracer: +	put_tracer(task);   out:  	return ERR_PTR(error);  } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, +					 void *base, size_t size, +					 pebs_ovfl_callback_t ovfl, +					 size_t th, unsigned int flags)  { -	if (!tracer) -		return; +	return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} -	ds_suspend_bts(tracer); +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, +					pebs_ovfl_callback_t ovfl, +					size_t th, unsigned int flags) +{ +	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} + +static void ds_free_bts(struct bts_tracer *tracer) +{ +	struct task_struct *task; + +	task = tracer->ds.context->task;  	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);  	tracer->ds.context->bts_master = NULL; -	put_tracer(tracer->ds.context->task); +	/* Make sure tracing stopped and the tracer is not in use. */ +	if (task && (task != current)) +		wait_task_context_switch(task); +  	ds_put_context(tracer->ds.context); +	put_tracer(task);  	kfree(tracer);  } +void ds_release_bts(struct bts_tracer *tracer) +{ +	might_sleep(); + +	if (!tracer) +		return; + +	ds_suspend_bts(tracer); +	ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ +	struct task_struct *task; +	unsigned long irq; +	int error; + +	if (!tracer) +		return 0; + +	task = tracer->ds.context->task; + +	local_irq_save(irq); + +	error = -EPERM; +	if (!task && +	    (tracer->ds.context->cpu != smp_processor_id())) +		goto out; + +	error = -EPERM; +	if (task && (task != current)) +		goto out; + +	ds_suspend_bts_noirq(tracer); +	ds_free_bts(tracer); + +	error = 0; + out: +	local_irq_restore(irq); +	return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, +				    unsigned long debugctlmsr) +{ +	task->thread.debugctlmsr = debugctlmsr; + +	get_cpu(); +	if (task == current) +		update_debugctlmsr(debugctlmsr); +	put_cpu(); +} +  void ds_suspend_bts(struct bts_tracer *tracer)  {  	struct task_struct *task; +	unsigned long debugctlmsr; +	int cpu;  	if (!tracer)  		return; +	tracer->flags = 0; +  	task = tracer->ds.context->task; +	cpu  = tracer->ds.context->cpu; -	if (!task || (task == current)) -		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); +	WARN_ON(!task && irqs_disabled()); -	if (task) { -		task->thread.debugctlmsr &= ~BTS_CONTROL; +	debugctlmsr = (task ? +		       task->thread.debugctlmsr : +		       get_debugctlmsr_on_cpu(cpu)); +	debugctlmsr &= ~BTS_CONTROL; -		if (!task->thread.debugctlmsr) -			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); -	} +	if (task) +		update_task_debugctlmsr(task, debugctlmsr); +	else +		update_debugctlmsr_on_cpu(cpu, debugctlmsr);  } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer)  {  	struct task_struct *task; -	unsigned long control; +	unsigned long debugctlmsr, irq; +	int cpu, error = 0;  	if (!tracer) -		return; +		return 0; + +	tracer->flags = 0;  	task = tracer->ds.context->task; +	cpu  = tracer->ds.context->cpu; + +	local_irq_save(irq); + +	error = -EPERM; +	if (!task && (cpu != smp_processor_id())) +		goto out; + +	debugctlmsr = (task ? 
+		       task->thread.debugctlmsr : +		       get_debugctlmsr()); +	debugctlmsr &= ~BTS_CONTROL; + +	if (task) +		update_task_debugctlmsr(task, debugctlmsr); +	else +		update_debugctlmsr(debugctlmsr); + +	error = 0; + out: +	local_irq_restore(irq); +	return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ +	unsigned long control;  	control = ds_cfg.ctl[dsf_bts];  	if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)  	if (!(tracer->trace.ds.flags & BTS_USER))  		control |= ds_cfg.ctl[dsf_bts_user]; -	if (task) { -		task->thread.debugctlmsr |= control; -		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); -	} - -	if (!task || (task == current)) -		update_debugctlmsr(get_debugctlmsr() | control); +	return control;  } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer)  { +	struct task_struct *task; +	unsigned long debugctlmsr; +	int cpu; +  	if (!tracer)  		return; -	ds_suspend_pebs(tracer); +	tracer->flags = tracer->trace.ds.flags; + +	task = tracer->ds.context->task; +	cpu  = tracer->ds.context->cpu; + +	WARN_ON(!task && irqs_disabled()); + +	debugctlmsr = (task ? +		       task->thread.debugctlmsr : +		       get_debugctlmsr_on_cpu(cpu)); +	debugctlmsr |= ds_bts_control(tracer); + +	if (task) +		update_task_debugctlmsr(task, debugctlmsr); +	else +		update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ +	struct task_struct *task; +	unsigned long debugctlmsr, irq; +	int cpu, error = 0; + +	if (!tracer) +		return 0; + +	tracer->flags = tracer->trace.ds.flags; + +	task = tracer->ds.context->task; +	cpu  = tracer->ds.context->cpu; + +	local_irq_save(irq); + +	error = -EPERM; +	if (!task && (cpu != smp_processor_id())) +		goto out; + +	debugctlmsr = (task ? 
+		       task->thread.debugctlmsr : +		       get_debugctlmsr()); +	debugctlmsr |= ds_bts_control(tracer); + +	if (task) +		update_task_debugctlmsr(task, debugctlmsr); +	else +		update_debugctlmsr(debugctlmsr); + +	error = 0; + out: +	local_irq_restore(irq); +	return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ +	struct task_struct *task; + +	task = tracer->ds.context->task;  	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);  	tracer->ds.context->pebs_master = NULL; -	put_tracer(tracer->ds.context->task);  	ds_put_context(tracer->ds.context); +	put_tracer(task);  	kfree(tracer);  } +void ds_release_pebs(struct pebs_tracer *tracer) +{ +	might_sleep(); + +	if (!tracer) +		return; + +	ds_suspend_pebs(tracer); +	ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ +	struct task_struct *task; +	unsigned long irq; +	int error; + +	if (!tracer) +		return 0; + +	task = tracer->ds.context->task; + +	local_irq_save(irq); + +	error = -EPERM; +	if (!task && +	    (tracer->ds.context->cpu != smp_processor_id())) +		goto out; + +	error = -EPERM; +	if (task && (task != current)) +		goto out; + +	ds_suspend_pebs_noirq(tracer); +	ds_free_pebs(tracer); + +	error = 0; + out: +	local_irq_restore(irq); +	return error; +} +  void ds_suspend_pebs(struct pebs_tracer *tracer)  {  } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ +	return 0; +} +  void ds_resume_pebs(struct pebs_tracer *tracer)  {  } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ +	return 0; +} +  const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)  {  	if (!tracer) @@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)  		return NULL;  	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); -	tracer->trace.reset_value = -		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + +	tracer->trace.counters = ds_cfg.nr_counter_reset; +	memcpy(tracer->trace.counter_reset, +	       tracer->ds.context->ds + +	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), +	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);  	return &tracer->trace;  } @@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)  	tracer->trace.ds.top = tracer->trace.ds.begin; -	ds_set(tracer->ds.context->ds, ds_bts, ds_index, +	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,  	       (unsigned long)tracer->trace.ds.top);  	return 0;  } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, +		      unsigned int counter, u64 value)  {  	if (!tracer)  		return -EINVAL; -	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; +	if (ds_cfg.nr_counter_reset < counter) +		return -EINVAL; + +	*(u64 *)(tracer->ds.context->ds + +		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + +		 (counter * PEBS_RESET_FIELD_SIZE)) = value;  	return 0;  } @@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {  	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),  	.ctl[dsf_bts_kernel]	= (1 << 5),  	.ctl[dsf_bts_user]	= (1 << 6), - -	.sizeof_field		= sizeof(long), -	.sizeof_rec[ds_bts]	= sizeof(long) * 3, -#ifdef __i386__ -	.sizeof_rec[ds_pebs]	= sizeof(long) * 10, -#else -	.sizeof_rec[ds_pebs]	= sizeof(long) * 18, -#endif +	.nr_counter_reset	= 1,  };  static const struct ds_configuration ds_cfg_pentium_m = {  	.name = "Pentium M",  	.ctl[dsf_bts]		= (1 << 6) | (1 << 7), - -	.sizeof_field		= sizeof(long), -	.sizeof_rec[ds_bts]	= sizeof(long) * 3, -#ifdef __i386__ 
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10, -#else -	.sizeof_rec[ds_pebs]	= sizeof(long) * 18, -#endif +	.nr_counter_reset	= 1,  };  static const struct ds_configuration ds_cfg_core2_atom = {  	.name = "Core 2/Atom",  	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),  	.ctl[dsf_bts_kernel]	= (1 << 9),  	.ctl[dsf_bts_user]	= (1 << 10), - -	.sizeof_field		= 8, -	.sizeof_rec[ds_bts]	= 8 * 3, -	.sizeof_rec[ds_pebs]	= 8 * 18, +	.nr_counter_reset	= 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { +	.name = "Core i7", +	.ctl[dsf_bts]		= (1 << 6) | (1 << 7), +	.ctl[dsf_bts_kernel]	= (1 << 9), +	.ctl[dsf_bts_user]	= (1 << 10), +	.nr_counter_reset	= 4,  };  static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, +	     struct cpuinfo_x86 *cpu)  { +	unsigned long nr_pebs_fields = 0; + +	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ +	nr_pebs_fields = 10; +#else +	nr_pebs_fields = 18; +#endif + +	/* +	 * Starting with version 2, architectural performance +	 * monitoring supports a format specifier. +	 */ +	if ((cpuid_eax(0xa) & 0xff) > 1) { +		unsigned long perf_capabilities, format; + +		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + +		format = (perf_capabilities >> 8) & 0xf; + +		switch (format) { +		case 0: +			nr_pebs_fields = 18; +			break; +		case 1: +			nr_pebs_fields = 22; +			break; +		default: +			printk(KERN_INFO +			       "[ds] unknown PEBS format: %lu\n", format); +			nr_pebs_fields = 0; +			break; +		} +	} +  	memset(&ds_cfg, 0, sizeof(ds_cfg));  	ds_cfg = *cfg; -	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); +	ds_cfg.sizeof_ptr_field = +		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); + +	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3; +	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; -	if (!cpu_has_bts) { -		ds_cfg.ctl[dsf_bts] = 0; +	if (!cpu_has(cpu, X86_FEATURE_BTS)) { +		ds_cfg.sizeof_rec[ds_bts] = 0;  		printk(KERN_INFO "[ds] bts not available\n");  	} -	if (!cpu_has_pebs) +	if (!cpu_has(cpu, X86_FEATURE_PEBS)) { +		ds_cfg.sizeof_rec[ds_pebs] = 0;  		printk(KERN_INFO "[ds] pebs not available\n"); +	} + +	printk(KERN_INFO "[ds] sizes: address: %u bit, ", +	       8 * ds_cfg.sizeof_ptr_field); +	printk("bts/pebs record: %u/%u bytes\n", +	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); -	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); +	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);  }  void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)  { +	/* Only configure the first cpu. Others are identical. */ +	if (ds_cfg.name) +		return; +  	switch (c->x86) {  	case 0x6:  		switch (c->x86_model) {  		case 0x9:  		case 0xd: /* Pentium M */ -			ds_configure(&ds_cfg_pentium_m); +			ds_configure(&ds_cfg_pentium_m, c);  			break;  		case 0xf:  		case 0x17: /* Core2 */  		case 0x1c: /* Atom */ -			ds_configure(&ds_cfg_core2_atom); +			ds_configure(&ds_cfg_core2_atom, c); +			break; +		case 0x1a: /* Core i7 */ +			ds_configure(&ds_cfg_core_i7, c);  			break; -		case 0x1a: /* i7 */  		default: -			/* sorry, don't know about them */ +			/* Sorry, don't know about them. */  			break;  		}  		break; @@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)  		case 0x0:  		case 0x1:  		case 0x2: /* Netburst */ -			ds_configure(&ds_cfg_netburst); +			ds_configure(&ds_cfg_netburst, c);  			break;  		default: -			/* sorry, don't know about them */ +			/* Sorry, don't know about them. 
*/  			break;  		}  		break;  	default: -		/* sorry, don't know about them */ +		/* Sorry, don't know about them. */  		break;  	}  } +static inline void ds_take_timestamp(struct ds_context *context, +				     enum bts_qualifier qualifier, +				     struct task_struct *task) +{ +	struct bts_tracer *tracer = context->bts_master; +	struct bts_struct ts; + +	/* Prevent compilers from reading the tracer pointer twice. */ +	barrier(); + +	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) +		return; + +	memset(&ts, 0, sizeof(ts)); +	ts.qualifier		= qualifier; +	ts.variant.event.clock	= trace_clock_global(); +	ts.variant.event.pid	= task->pid; + +	bts_write(tracer, &ts); +} +  /*   * Change the DS configuration from tracing prev to tracing next.   */  void ds_switch_to(struct task_struct *prev, struct task_struct *next)  { -	struct ds_context *prev_ctx = prev->thread.ds_ctx; -	struct ds_context *next_ctx = next->thread.ds_ctx; +	struct ds_context *prev_ctx	= prev->thread.ds_ctx; +	struct ds_context *next_ctx	= next->thread.ds_ctx; +	unsigned long debugctlmsr	= next->thread.debugctlmsr; + +	/* Make sure all data is read before we start. */ +	barrier();  	if (prev_ctx) {  		update_debugctlmsr(0); -		if (prev_ctx->bts_master && -		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { -			struct bts_struct ts = { -				.qualifier = bts_task_departs, -				.variant.timestamp.jiffies = jiffies_64, -				.variant.timestamp.pid = prev->pid -			}; -			bts_write(prev_ctx->bts_master, &ts); -		} +		ds_take_timestamp(prev_ctx, bts_task_departs, prev);  	}  	if (next_ctx) { -		if (next_ctx->bts_master && -		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { -			struct bts_struct ts = { -				.qualifier = bts_task_arrives, -				.variant.timestamp.jiffies = jiffies_64, -				.variant.timestamp.pid = next->pid -			}; -			bts_write(next_ctx->bts_master, &ts); -		} +		ds_take_timestamp(next_ctx, bts_task_arrives, next);  		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);  	} -	update_debugctlmsr(next->thread.debugctlmsr); +	update_debugctlmsr(debugctlmsr);  } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +static __init int ds_selftest(void)  { -	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); -	tsk->thread.ds_ctx = NULL; -} +	if (ds_cfg.sizeof_rec[ds_bts]) { +		int error; -void ds_exit_thread(struct task_struct *tsk) -{ +		error = ds_selftest_bts(); +		if (error) { +			WARN(1, "[ds] selftest failed. disabling bts.\n"); +			ds_cfg.sizeof_rec[ds_bts] = 0; +		} +	} + +	if (ds_cfg.sizeof_rec[ds_pebs]) { +		int error; + +		error = ds_selftest_pebs(); +		if (error) { +			WARN(1, "[ds] selftest failed. disabling pebs.\n"); +			ds_cfg.sizeof_rec[ds_pebs] = 0; +		} +	} + +	return 0;  } +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 00000000000..6bc7c199ab9 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,408 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#include "ds_selftest.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/ds.h> + + +#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE	24	/* A single bts entry. 
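+					 * (3 fields * 8 bytes, i.e. one
+					 * sizeof_rec[ds_bts] under the
+					 * 64-bit DS layout.)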
*/ + +struct ds_selftest_bts_conf { +	struct bts_tracer *tracer; +	int error; +	int (*suspend)(struct bts_tracer *); +	int (*resume)(struct bts_tracer *); +}; + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ +	int error = 0; + +	if (!trace) { +		printk(KERN_CONT "failed to access trace..."); +		/* Bail out. Other tests are pointless. */ +		return -1; +	} + +	if (!trace->read) { +		printk(KERN_CONT "bts read not available..."); +		error = -1; +	} + +	/* Do some sanity checks on the trace configuration. */ +	if (!trace->ds.n) { +		printk(KERN_CONT "empty bts buffer..."); +		error = -1; +	} +	if (!trace->ds.size) { +		printk(KERN_CONT "bad bts trace setup..."); +		error = -1; +	} +	if (trace->ds.end != +	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { +		printk(KERN_CONT "bad bts buffer setup..."); +		error = -1; +	} +	/* +	 * We allow top in [begin; end], since its not clear when the +	 * overflow adjustment happens: after the increment or before the +	 * write. +	 */ +	if ((trace->ds.top < trace->ds.begin) || +	    (trace->ds.end < trace->ds.top)) { +		printk(KERN_CONT "bts top out of bounds..."); +		error = -1; +	} + +	return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, +				const struct bts_trace *trace, +				const void *from, const void *to) +{ +	const unsigned char *at; + +	/* +	 * Check a few things which do not belong to this test. +	 * They should be covered by other tests. +	 */ +	if (!trace) +		return -1; + +	if (!trace->read) +		return -1; + +	if (to < from) +		return -1; + +	if (from < trace->ds.begin) +		return -1; + +	if (trace->ds.end < to) +		return -1; + +	if (!trace->ds.size) +		return -1; + +	/* Now to the test itself. */ +	for (at = from; (void *)at < to; at += trace->ds.size) { +		struct bts_struct bts; +		unsigned long index; +		int error; + +		if (((void *)at - trace->ds.begin) % trace->ds.size) { +			printk(KERN_CONT +			       "read from non-integer index..."); +			return -1; +		} +		index = ((void *)at - trace->ds.begin) / trace->ds.size; + +		memset(&bts, 0, sizeof(bts)); +		error = trace->read(tracer, at, &bts); +		if (error < 0) { +			printk(KERN_CONT +			       "error reading bts trace at [%lu] (0x%p)...", +			       index, at); +			return error; +		} + +		switch (bts.qualifier) { +		case BTS_BRANCH: +			break; +		default: +			printk(KERN_CONT +			       "unexpected bts entry %llu at [%lu] (0x%p)...", +			       bts.qualifier, index, at); +			return -1; +		} +	} + +	return 0; +} + +static void ds_selftest_bts_cpu(void *arg) +{ +	struct ds_selftest_bts_conf *conf = arg; +	const struct bts_trace *trace; +	void *top; + +	if (IS_ERR(conf->tracer)) { +		conf->error = PTR_ERR(conf->tracer); +		conf->tracer = NULL; + +		printk(KERN_CONT +		       "initialization failed (err: %d)...", conf->error); +		return; +	} + +	/* We should meanwhile have enough trace. */ +	conf->error = conf->suspend(conf->tracer); +	if (conf->error < 0) +		return; + +	/* Let's see if we can access the trace. */ +	trace = ds_read_bts(conf->tracer); + +	conf->error = ds_selftest_bts_consistency(trace); +	if (conf->error < 0) +		return; + +	/* If everything went well, we should have a few trace entries. */ +	if (trace->ds.top == trace->ds.begin) { +		/* +		 * It is possible but highly unlikely that we got a +		 * buffer overflow and end up at exactly the same +		 * position we started from. +		 * Let's issue a warning, but continue. 
+		 */ +		printk(KERN_CONT "no trace/overflow..."); +	} + +	/* Let's try to read the trace we collected. */ +	conf->error = +		ds_selftest_bts_read(conf->tracer, trace, +				     trace->ds.begin, trace->ds.top); +	if (conf->error < 0) +		return; + +	/* +	 * Let's read the trace again. +	 * Since we suspended tracing, we should get the same result. +	 */ +	top = trace->ds.top; + +	trace = ds_read_bts(conf->tracer); +	conf->error = ds_selftest_bts_consistency(trace); +	if (conf->error < 0) +		return; + +	if (top != trace->ds.top) { +		printk(KERN_CONT "suspend not working..."); +		conf->error = -1; +		return; +	} + +	/* Let's collect some more trace - see if resume is working. */ +	conf->error = conf->resume(conf->tracer); +	if (conf->error < 0) +		return; + +	conf->error = conf->suspend(conf->tracer); +	if (conf->error < 0) +		return; + +	trace = ds_read_bts(conf->tracer); + +	conf->error = ds_selftest_bts_consistency(trace); +	if (conf->error < 0) +		return; + +	if (trace->ds.top == top) { +		/* +		 * It is possible but highly unlikely that we got a +		 * buffer overflow and end up at exactly the same +		 * position we started from. +		 * Let's issue a warning and check the full trace. +		 */ +		printk(KERN_CONT +		       "no resume progress/overflow..."); + +		conf->error = +			ds_selftest_bts_read(conf->tracer, trace, +					     trace->ds.begin, trace->ds.end); +	} else if (trace->ds.top < top) { +		/* +		 * We had a buffer overflow - the entire buffer should +		 * contain trace records. +		 */ +		conf->error = +			ds_selftest_bts_read(conf->tracer, trace, +					     trace->ds.begin, trace->ds.end); +	} else { +		/* +		 * It is quite likely that the buffer did not overflow. +		 * Let's just check the delta trace. +		 */ +		conf->error = +			ds_selftest_bts_read(conf->tracer, trace, top, +					     trace->ds.top); +	} +	if (conf->error < 0) +		return; + +	conf->error = 0; +} + +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ +	ds_suspend_bts(tracer); +	return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ +	ds_resume_bts(tracer); +	return 0; +} + +static void ds_release_bts_noirq_wrap(void *tracer) +{ +	(void)ds_release_bts_noirq(tracer); +} + +static int ds_selftest_bts_bad_release_noirq(int cpu, +					     struct bts_tracer *tracer) +{ +	int error = -EPERM; + +	/* Try to release the tracer on the wrong cpu. */ +	get_cpu(); +	if (cpu != smp_processor_id()) { +		error = ds_release_bts_noirq(tracer); +		if (error != -EPERM) +			printk(KERN_CONT "release on wrong cpu..."); +	} +	put_cpu(); + +	return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ +	struct bts_tracer *tracer; +	int error; + +	/* Try to request cpu tracing while task tracing is active. */ +	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, +				    (size_t)-1, BTS_KERNEL); +	error = PTR_ERR(tracer); +	if (!IS_ERR(tracer)) { +		ds_release_bts(tracer); +		error = 0; +	} + +	if (error != -EPERM) +		printk(KERN_CONT "cpu/task tracing overlap..."); + +	return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ +	struct bts_tracer *tracer; +	int error; + +	/* Try to request cpu tracing while task tracing is active. */ +	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, +				    (size_t)-1, BTS_KERNEL); +	error = PTR_ERR(tracer); +	if (!IS_ERR(tracer)) { +		error = 0; +		ds_release_bts(tracer); +	} + +	if (error != -EPERM) +		printk(KERN_CONT "task/cpu tracing overlap..."); + +	return error ? 
0 : -1; +} + +int ds_selftest_bts(void) +{ +	struct ds_selftest_bts_conf conf; +	unsigned char buffer[BUFFER_SIZE], *small_buffer; +	unsigned long irq; +	int cpu; + +	printk(KERN_INFO "[ds] bts selftest..."); +	conf.error = 0; + +	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + +	get_online_cpus(); +	for_each_online_cpu(cpu) { +		conf.suspend = ds_suspend_bts_wrap; +		conf.resume = ds_resume_bts_wrap; +		conf.tracer = +			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, +					   NULL, (size_t)-1, BTS_KERNEL); +		ds_selftest_bts_cpu(&conf); +		if (conf.error >= 0) +			conf.error = ds_selftest_bts_bad_request_task(buffer); +		ds_release_bts(conf.tracer); +		if (conf.error < 0) +			goto out; + +		conf.suspend = ds_suspend_bts_noirq; +		conf.resume = ds_resume_bts_noirq; +		conf.tracer = +			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, +					   NULL, (size_t)-1, BTS_KERNEL); +		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); +		if (conf.error >= 0) { +			conf.error = +				ds_selftest_bts_bad_release_noirq(cpu, +								  conf.tracer); +			/* We must not release the tracer twice. */ +			if (conf.error < 0) +				conf.tracer = NULL; +		} +		if (conf.error >= 0) +			conf.error = ds_selftest_bts_bad_request_task(buffer); +		smp_call_function_single(cpu, ds_release_bts_noirq_wrap, +					 conf.tracer, 1); +		if (conf.error < 0) +			goto out; +	} + +	conf.suspend = ds_suspend_bts_wrap; +	conf.resume = ds_resume_bts_wrap; +	conf.tracer = +		ds_request_bts_task(current, buffer, BUFFER_SIZE, +				    NULL, (size_t)-1, BTS_KERNEL); +	ds_selftest_bts_cpu(&conf); +	if (conf.error >= 0) +		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); +	ds_release_bts(conf.tracer); +	if (conf.error < 0) +		goto out; + +	conf.suspend = ds_suspend_bts_noirq; +	conf.resume = ds_resume_bts_noirq; +	conf.tracer = +		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, +				   NULL, (size_t)-1, BTS_KERNEL); +	local_irq_save(irq); +	ds_selftest_bts_cpu(&conf); +	if (conf.error >= 0) +		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); +	ds_release_bts_noirq(conf.tracer); +	local_irq_restore(irq); +	if (conf.error < 0) +		goto out; + +	conf.error = 0; + out: +	put_online_cpus(); +	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); + +	return conf.error; +} + +int ds_selftest_pebs(void) +{ +	return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 00000000000..2ba8745c666 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. 
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b869..81086c227ab 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  		unsigned long *sp, unsigned long bp, char *log_lvl);  extern unsigned int code_bytes; -extern int kstack_depth_to_print;  /* The form of the top of the frame on the stack */  struct stack_frame { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 00628130292..7271fa33d79 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,   */  __init void e820_setup_gap(void)  { -	unsigned long gapstart, gapsize, round; +	unsigned long gapstart, gapsize;  	int found;  	gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void)  #endif  	/* -	 * See how much we want to round up: start off with -	 * rounding to the next 1MB area. +	 * e820_reserve_resources_late protect stolen RAM already  	 */ -	round = 0x100000; -	while ((gapsize >> 4) > round) -		round += round; -	/* Fun with two's complement */ -	pci_mem_start = (gapstart + round) & -round; +	pci_mem_start = gapstart;  	printk(KERN_INFO  	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", @@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)  	}  } +/* How much should we pad RAM ending depending on where it is? 
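A note on the e820 changes above: pci_mem_start is now taken straight from the gap start instead of being rounded up, and the helper immediately below answers the question in the comment: RAM region ends are padded to 64 KiB below 1 MB, to 1 MiB below 16 MB, and to 32 MiB above that, with the padding reserved as a "RAM buffer" resource in e820_reserve_resources_late(). A user-space sketch of the same rounding, for illustration only (round_up here is a generic ceiling, not the kernel macro):

	#include <stdio.h>

	static unsigned long ram_alignment(unsigned long long pos)
	{
		unsigned long mb = pos >> 20;

		if (!mb)
			return 64 * 1024;		/* region ends below 1 MB */
		if (mb < 16)
			return 1024 * 1024;		/* below 16 MB */
		return 32 * 1024 * 1024;		/* everything else */
	}

	#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

	int main(void)
	{
		/* A RAM region ending at 0x3f6e0000 is padded up to 0x40000000. */
		unsigned long long end = 0x3f6e0000ULL;

		printf("%#llx -> %#llx\n", end,
		       (unsigned long long)round_up(end, ram_alignment(end)));
		return 0;
	}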
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ +	unsigned long mb = pos >> 20; + +	/* To 64kB in the first megabyte */ +	if (!mb) +		return 64*1024; + +	/* To 1MB in the first 16MB */ +	if (mb < 16) +		return 1024*1024; + +	/* To 32MB for anything above that */ +	return 32*1024*1024; +} +  void __init e820_reserve_resources_late(void)  {  	int i; @@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)  			insert_resource_expand_to_fit(&iomem_resource, res);  		res++;  	} + +	/* +	 * Try to bump up RAM regions to reasonable boundaries to +	 * avoid stolen RAM: +	 */ +	for (i = 0; i < e820.nr_map; i++) { +		struct e820entry *entry = &e820_saved.map[i]; +		resource_size_t start, end; + +		if (entry->type != E820_RAM) +			continue; +		start = entry->addr + entry->size; +		end = round_up(start, ram_alignment(start)); +		if (start == end) +			continue; +		reserve_region_with_split(&iomem_resource, start, +						  end - 1, "RAM buffer"); +	}  }  char *__init default_machine_specific_memory_setup(void) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..ebdb85cf268 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)  }  #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)  static u32 __init ati_ixp4x0_rev(int num, int slot, int func)  {  	u32 d; @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)  	d &= 0xff;  	return d;  } +#endif  static void __init ati_bugs(int num, int slot, int func)  { diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1736acc4d7a..96f7ac0bbf0 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void)  		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;  		int e820_type; -		if (md->attribute & EFI_MEMORY_WB) -			e820_type = E820_RAM; -		else +		switch (md->type) { +		case EFI_LOADER_CODE: +		case EFI_LOADER_DATA: +		case EFI_BOOT_SERVICES_CODE: +		case EFI_BOOT_SERVICES_DATA: +		case EFI_CONVENTIONAL_MEMORY: +			if (md->attribute & EFI_MEMORY_WB) +				e820_type = E820_RAM; +			else +				e820_type = E820_RESERVED; +			break; +		case EFI_ACPI_RECLAIM_MEMORY: +			e820_type = E820_ACPI; +			break; +		case EFI_ACPI_MEMORY_NVS: +			e820_type = E820_NVS; +			break; +		case EFI_UNUSABLE_MEMORY: +			e820_type = E820_UNUSABLE; +			break; +		default: +			/* +			 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE +			 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO +			 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE +			 */  			e820_type = E820_RESERVED; +			break; +		}  		e820_add_region(start, size, e820_type);  	}  	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c..c097e7d607c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -48,7 +48,6 @@  #include <asm/segment.h>  #include <asm/smp.h>  #include <asm/page_types.h> -#include <asm/desc.h>  #include <asm/percpu.h>  #include <asm/dwarf2.h>  #include <asm/processor-flags.h> @@ -84,7 +83,7 @@  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF  #else  #define preempt_stop(clobbers) -#define resume_kernel		restore_nocheck +#define resume_kernel		restore_all  #endif  .macro TRACE_IRQS_IRET @@ -372,7 +371,7 @@ END(ret_from_exception)  
ENTRY(resume_kernel)  	DISABLE_INTERRUPTS(CLBR_ANY)  	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ? -	jnz restore_nocheck +	jnz restore_all  need_resched:  	movl TI_flags(%ebp), %ecx	# need_resched set ?  	testb $_TIF_NEED_RESCHED, %cl @@ -540,6 +539,8 @@ syscall_exit:  	jne syscall_exit_work  restore_all: +	TRACE_IRQS_IRET +restore_all_notrace:  	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS  	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we  	# are returning to the kernel. @@ -551,8 +552,6 @@ restore_all:  	CFI_REMEMBER_STATE  	je ldt_ss			# returning to user-space with LDT SS  restore_nocheck: -	TRACE_IRQS_IRET -restore_nocheck_notrace:  	RESTORE_REGS 4			# skip orig_eax/error_code  	CFI_ADJUST_CFA_OFFSET -4  irq_return: @@ -588,22 +587,34 @@ ldt_ss:  	jne restore_nocheck  #endif -	/* If returning to userspace with 16bit stack, -	 * try to fix the higher word of ESP, as the CPU -	 * won't restore it. -	 * This is an "official" bug of all the x86-compatible -	 * CPUs, which we can try to work around to make -	 * dosemu and wine happy. */ -	movl PT_OLDESP(%esp), %eax -	movl %esp, %edx -	call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +	mov %esp, %edx			/* load kernel esp */ +	mov PT_OLDESP(%esp), %eax	/* load userspace esp */ +	mov %dx, %ax			/* eax: new kernel esp */ +	sub %eax, %edx			/* offset (low word is 0) */ +	PER_CPU(gdt_page, %ebx) +	shr $16, %edx +	mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ +	mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */  	pushl $__ESPFIX_SS  	CFI_ADJUST_CFA_OFFSET 4 -	pushl %eax +	push %eax			/* new kernel esp */  	CFI_ADJUST_CFA_OFFSET 4 +	/* Disable interrupts, but do not irqtrace this section: we +	 * will soon execute iret and the tracer was already set to +	 * the irqstate after the iret */  	DISABLE_INTERRUPTS(CLBR_EAX) -	TRACE_IRQS_OFF -	lss (%esp), %esp +	lss (%esp), %esp		/* switch to espfix segment */  	CFI_ADJUST_CFA_OFFSET -8  	jmp restore_nocheck  	CFI_ENDPROC @@ -716,15 +727,24 @@ PTREGSCALL(vm86)  PTREGSCALL(vm86old)  .macro FIXUP_ESPFIX_STACK -	/* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. 
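A note on the ldt_ss/FIXUP_ESPFIX_STACK rework above: the patch pokes the two high bytes of the ESPFIX segment base straight into the GDT entry (the low 16 bits of the offset are known to be zero), and the unwind path reads them back. A hedged C sketch of the descriptor byte layout involved (the helper is illustrative, not taken from the patch):

	#include <stdint.h>

	/*
	 * Base address bytes of an 8-byte GDT descriptor:
	 *   bits  0..15 -> bytes 2..3
	 *   bits 16..23 -> byte  4   (written via %dl above)
	 *   bits 24..31 -> byte  7   (written via %dh above)
	 */
	static void set_desc_base(uint8_t desc[8], uint32_t base)
	{
		desc[2] = base & 0xff;
		desc[3] = (base >> 8) & 0xff;
		desc[4] = (base >> 16) & 0xff;
		desc[7] = (base >> 24) & 0xff;
	}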
+ */ +	/* fixup the stack */  	PER_CPU(gdt_page, %ebx) -	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) -	addl %esp, %eax +	mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ +	mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ +	shl $16, %eax +	addl %esp, %eax			/* the adjusted stack pointer */  	pushl $__KERNEL_DS  	CFI_ADJUST_CFA_OFFSET 4  	pushl %eax  	CFI_ADJUST_CFA_OFFSET 4 -	lss (%esp), %esp +	lss (%esp), %esp		/* switch to the normal stack segment */  	CFI_ADJUST_CFA_OFFSET -8  .endm  .macro UNWIND_ESPFIX_STACK @@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller)  	pushl %edx  	movl 0xc(%esp), %edx  	lea 0x4(%ebp), %eax +	movl (%ebp), %ecx  	subl $MCOUNT_INSN_SIZE, %edx  	call prepare_ftrace_return  	popl %edx @@ -1168,6 +1189,7 @@ return_to_handler:  	pushl %eax  	pushl %ecx  	pushl %edx +	movl %ebp, %eax  	call ftrace_return_to_handler  	movl %eax, 0xc(%esp)  	popl %edx @@ -1329,7 +1351,7 @@ nmi_stack_correct:  	xorl %edx,%edx		# zero error code  	movl %esp,%eax		# pt_regs pointer  	call do_nmi -	jmp restore_nocheck_notrace +	jmp restore_all_notrace  	CFI_ENDPROC  nmi_stack_fixup: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..c251be74510 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller)  	leaq 8(%rbp), %rdi  	movq 0x38(%rsp), %rsi +	movq (%rbp), %rdx  	subq $MCOUNT_INSN_SIZE, %rsi  	call	prepare_ftrace_return @@ -147,27 +148,15 @@ END(ftrace_graph_caller)  GLOBAL(return_to_handler)  	subq  $80, %rsp +	/* Save the return values */  	movq %rax, (%rsp) -	movq %rcx, 8(%rsp) -	movq %rdx, 16(%rsp) -	movq %rsi, 24(%rsp) -	movq %rdi, 32(%rsp) -	movq %r8, 40(%rsp) -	movq %r9, 48(%rsp) -	movq %r10, 56(%rsp) -	movq %r11, 64(%rsp) +	movq %rdx, 8(%rsp) +	movq %rbp, %rdi  	call ftrace_return_to_handler  	movq %rax, 72(%rsp) -	movq 64(%rsp), %r11 -	movq 56(%rsp), %r10 -	movq 48(%rsp), %r9 -	movq 40(%rsp), %r8 -	movq 32(%rsp), %rdi -	movq 24(%rsp), %rsi -	movq 16(%rsp), %rdx -	movq 8(%rsp), %rcx +	movq 8(%rsp), %rdx  	movq (%rsp), %rax  	addq $72, %rsp  	retq @@ -976,6 +965,8 @@ END(\sym)  #ifdef CONFIG_SMP  apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \  	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ +	reboot_interrupt smp_reboot_interrupt  #endif  #ifdef CONFIG_X86_UV @@ -1007,10 +998,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \  #endif  apicinterrupt THRESHOLD_APIC_VECTOR \ -	threshold_interrupt mce_threshold_interrupt +	threshold_interrupt smp_threshold_interrupt  apicinterrupt THERMAL_APIC_VECTOR \  	thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ +	mce_self_interrupt smp_mce_self_interrupt +#endif +  #ifdef CONFIG_SMP  apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \  	call_function_single_interrupt smp_call_function_single_interrupt @@ -1025,6 +1021,11 @@ apicinterrupt ERROR_APIC_VECTOR \  apicinterrupt SPURIOUS_APIC_VECTOR \  	spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PENDING_VECTOR \ +	perf_pending_interrupt smp_perf_pending_interrupt +#endif +  /*   * Exception entry points.   
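A note on the function-graph tracer hunks: both the 32-bit and 64-bit callers now thread a frame-pointer value through to prepare_ftrace_return() and ftrace_return_to_handler(), which suggests the return-stack entry records it so the exit path can verify it is unwinding the same frame it hooked. A purely conceptual sketch, with made-up types rather than the actual ftrace structures:

	struct graph_ret_entry {
		unsigned long ret;	/* original return address */
		unsigned long fp;	/* frame pointer recorded at entry */
	};

	/* entry side: remember fp along with the hijacked return address */
	static void push_graph_entry(struct graph_ret_entry *e,
				     unsigned long ret, unsigned long fp)
	{
		e->ret = ret;
		e->fp = fp;
	}

	/* exit side: refuse to redirect if the frame no longer matches */
	static unsigned long pop_graph_entry(struct graph_ret_entry *e,
					     unsigned long fp)
	{
		return (e->fp == fp) ? e->ret : 0;
	}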
*/ @@ -1379,10 +1380,15 @@ END(xen_failsafe_callback)  paranoidzeroentry_ist debug do_debug DEBUG_STACK  paranoidzeroentry_ist int3 do_int3 DEBUG_STACK  paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif  errorentry general_protection do_general_protection  errorentry page_fault do_page_fault  #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip)  #endif  	/* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c5533c42..d94e1ea3b9f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void)   * Hook the return address and push it in the stack of return addrs   * in current thread info.   */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +			   unsigned long frame_pointer)  {  	unsigned long old;  	int faulted; @@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)  		return;  	} -	if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { +	if (ftrace_push_return_trace(old, self_addr, &trace.depth, +		    frame_pointer) == -EBUSY) {  		*parent = old;  		return;  	} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0c..8663afb5653 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -13,7 +13,6 @@  #include <asm/segment.h>  #include <asm/page_types.h>  #include <asm/pgtable_types.h> -#include <asm/desc.h>  #include <asm/cache.h>  #include <asm/thread_info.h>  #include <asm/asm-offsets.h> @@ -608,13 +607,6 @@ ignore_int:  ENTRY(initial_code)  	.long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) -  /*   * BSS section   */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb24e7..fa54f78e2a0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -12,7 +12,6 @@  #include <linux/linkage.h>  #include <linux/threads.h>  #include <linux/init.h> -#include <asm/desc.h>  #include <asm/segment.h>  #include <asm/pgtable.h>  #include <asm/page.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 81408b93f88..dedc2bddf7a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev)  {  	if (request_irq(dev->irq, hpet_interrupt_handler, -			IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) +			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, +			dev->name, dev))  		return -1;  	disable_irq(dev->irq); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c2e0bb0890d..5cf36c053ac 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -7,6 +7,7 @@  #include <linux/spinlock.h>  #include <linux/jiffies.h>  #include <linux/module.h> +#include <linux/timex.h>  #include <linux/delay.h>  #include <linux/init.h>  #include <linux/io.h> diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269bea..270ff83efc1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@  static struct signal_struct init_signals = INIT_SIGNALS(init_signals);  static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct 
mm_struct init_mm = INIT_MM(init_mm);  /*   * Initial thread structure. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c..b0cdde6932f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,8 @@  #include <asm/io_apic.h>  #include <asm/irq.h>  #include <asm/idle.h> +#include <asm/mce.h> +#include <asm/hw_irq.h>  atomic_t irq_err_count; @@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;   */  void ack_bad_irq(unsigned int irq)  { -	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); +	if (printk_ratelimit()) +		pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC  	/*  	 * Currently unexpected vectors happen only on SMP and APIC.  	 * We _must_ ack these because every local APIC has only N @@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)  	 * completely.  	 * But only ack when the APIC is enabled -AK  	 */ -	if (cpu_has_apic) -		ack_APIC_irq(); -#endif +	ack_APIC_irq();  }  #define irq_stats(x)		(&per_cpu(irq_stat, x)) @@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);  	seq_printf(p, "  Spurious interrupts\n"); +	seq_printf(p, "%*s: ", prec, "CNT"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); +	seq_printf(p, "  Performance counter interrupts\n"); +	seq_printf(p, "%*s: ", prec, "PND"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); +	seq_printf(p, "  Performance pending work\n");  #endif  	if (generic_interrupt_extension) {  		seq_printf(p, "%*s: ", prec, "PLT"); @@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);  	seq_printf(p, "  Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD  	seq_printf(p, "%*s: ", prec, "THR");  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);  	seq_printf(p, "  Threshold APIC interrupts\n");  # endif  #endif +#ifdef CONFIG_X86_NEW_MCE +	seq_printf(p, "%*s: ", prec, "MCE"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); +	seq_printf(p, "  Machine check exceptions\n"); +	seq_printf(p, "%*s: ", prec, "MCP"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); +	seq_printf(p, "  Machine check polls\n"); +#endif  	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));  #if defined(CONFIG_X86_IO_APIC)  	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); @@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  #ifdef CONFIG_X86_LOCAL_APIC  	sum += irq_stats(cpu)->apic_timer_irqs;  	sum += irq_stats(cpu)->irq_spurious_count; +	sum += irq_stats(cpu)->apic_perf_irqs; +	sum += irq_stats(cpu)->apic_pending_irqs;  #endif  	if (generic_interrupt_extension)  		sum += irq_stats(cpu)->generic_irqs; @@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  #endif  #ifdef CONFIG_X86_MCE  	sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD  	sum += irq_stats(cpu)->irq_threshold_count; +# endif  #endif +#ifdef CONFIG_X86_NEW_MCE +	sum += per_cpu(mce_exception_count, cpu); +	sum += per_cpu(mce_poll_count, cpu);  #endif  	return sum;  } @@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)  	irq = 
__get_cpu_var(vector_irq)[vector];  	if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 -		if (!disable_apic) -			ack_APIC_irq(); -#endif +		ack_APIC_irq();  		if (printk_ratelimit()) -			printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", -			       __func__, smp_processor_id(), vector, irq); +			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", +				__func__, smp_processor_id(), vector, irq);  	}  	irq_exit(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c index 368b0a8836f..696f0e475c2 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit.c @@ -1,20 +1,25 @@ +#include <linux/linkage.h>  #include <linux/errno.h>  #include <linux/signal.h>  #include <linux/sched.h>  #include <linux/ioport.h>  #include <linux/interrupt.h> +#include <linux/timex.h>  #include <linux/slab.h>  #include <linux/random.h> +#include <linux/kprobes.h>  #include <linux/init.h>  #include <linux/kernel_stat.h>  #include <linux/sysdev.h>  #include <linux/bitops.h> +#include <linux/acpi.h>  #include <linux/io.h>  #include <linux/delay.h>  #include <asm/atomic.h>  #include <asm/system.h>  #include <asm/timer.h> +#include <asm/hw_irq.h>  #include <asm/pgtable.h>  #include <asm/desc.h>  #include <asm/apic.h> @@ -22,7 +27,23 @@  #include <asm/i8259.h>  #include <asm/traps.h> +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ +#ifdef CONFIG_X86_32  /*   * Note that on a 486, we don't want to do a SIGFPE on an irq13   * as the irq is unreliable, and exception 16 works correctly @@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {  	.handler = math_error_irq,  	.name = "fpu",  }; - -void __init init_ISA_irqs(void) -{ -	int i; - -#ifdef CONFIG_X86_LOCAL_APIC -	init_bsp_APIC();  #endif -	init_8259A(0); - -	/* -	 * 16 old-style INTA-cycle interrupts: -	 */ -	for (i = 0; i < NR_IRQS_LEGACY; i++) { -		struct irq_desc *desc = irq_to_desc(i); - -		desc->status = IRQ_DISABLED; -		desc->action = NULL; -		desc->depth = 1; - -		set_irq_chip_and_handler_name(i, &i8259A_chip, -					      handle_level_irq, "XT"); -	} -}  /*   * IRQ2 is cascade interrupt to second interrupt controller @@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)  	return 0;  } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init init_ISA_irqs(void)  {  	int i; -	/* Execute any quirks before the call gates are initialised: */ -	x86_quirk_pre_intr_init(); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) +	init_bsp_APIC(); +#endif +	init_8259A(0);  	/* -	 * Cover the whole vector space, no vector can escape -	 * us. (some of these will be overridden and become -	 * 'special' SMP interrupts) +	 * 16 old-style INTA-cycle interrupts:  	 */ -	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -		/* SYSCALL_VECTOR was reserved in trap_init. 
*/ -		if (i != SYSCALL_VECTOR) -			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +	for (i = 0; i < NR_IRQS_LEGACY; i++) { +		struct irq_desc *desc = irq_to_desc(i); + +		desc->status = IRQ_DISABLED; +		desc->action = NULL; +		desc->depth = 1; + +		set_irq_chip_and_handler_name(i, &i8259A_chip, +					      handle_level_irq, "XT");  	} +} +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)  	/*  	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper  	 * IPI, driven by wakeup. @@ -160,16 +166,35 @@ void __init native_init_IRQ(void)  	/* IPI for generic function call */  	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); -	/* IPI for single call function */ +	/* IPI for generic single function call */  	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, -				 call_function_single_interrupt); +			call_function_single_interrupt);  	/* Low priority IPI to cleanup after moving an irq */  	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);  	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + +	/* IPI used for rebooting/stopping */ +	alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);  #endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ +	smp_intr_init(); -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_THERMAL_VECTOR +	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#ifdef CONFIG_X86_THRESHOLD +	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)  	/* self generated IPI for local APIC timer */  	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -179,16 +204,59 @@ void __init native_init_IRQ(void)  	/* IPI vectors for APIC spurious and error interrupts */  	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);  	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + +	/* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS +	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif +  #endif +} -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) -	/* thermal monitor LVT interrupt */ -	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + *	Perform any necessary interrupt initialisation prior to setting up + *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA + *	interrupts should be initialised here if the machine emulates a PC + *	in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ +#ifdef CONFIG_X86_32 +	if (x86_quirks->arch_pre_intr_init) { +		if (x86_quirks->arch_pre_intr_init()) +			return; +	}  #endif +	init_ISA_irqs(); +} + +void __init native_init_IRQ(void) +{ +	int i; + +	/* Execute any quirks before the call gates are initialised: */ +	x86_quirk_pre_intr_init(); + +	apic_intr_init(); + +	/* +	 * Cover the whole vector space, no vector can escape +	 * us. (some of these will be overridden and become +	 * 'special' SMP interrupts) +	 */ +	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +		/* IA32_SYSCALL_VECTOR could be used in trap_init already. 
*/ +		if (!test_bit(i, used_vectors)) +			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +	}  	if (!acpi_ioapic)  		setup_irq(2, &irq2); +#ifdef CONFIG_X86_32  	/*  	 * Call quirks after call gates are initialised (usually add in  	 * the architecture specific gates): @@ -203,4 +271,5 @@ void __init native_init_IRQ(void)  		setup_irq(FPU_IRQ, &fpu_irq);  	irq_ctx_init(smp_processor_id()); +#endif  } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index 8cd10537fd4..00000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { -	.handler = no_action, -	.name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { -	[0 ... IRQ0_VECTOR - 1] = -1, -	[IRQ0_VECTOR] = 0, -	[IRQ1_VECTOR] = 1, -	[IRQ2_VECTOR] = 2, -	[IRQ3_VECTOR] = 3, -	[IRQ4_VECTOR] = 4, -	[IRQ5_VECTOR] = 5, -	[IRQ6_VECTOR] = 6, -	[IRQ7_VECTOR] = 7, -	[IRQ8_VECTOR] = 8, -	[IRQ9_VECTOR] = 9, -	[IRQ10_VECTOR] = 10, -	[IRQ11_VECTOR] = 11, -	[IRQ12_VECTOR] = 12, -	[IRQ13_VECTOR] = 13, -	[IRQ14_VECTOR] = 14, -	[IRQ15_VECTOR] = 15, -	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ -	int cpu; - -	for_each_online_cpu(cpu) { -		if (per_cpu(vector_irq, cpu)[vector] != -1) -			return 1; -	} - -	return 0; -} - -static void __init init_ISA_irqs(void) -{ -	int i; - -	init_bsp_APIC(); -	init_8259A(0); - -	for (i = 0; i < NR_IRQS_LEGACY; i++) { -		struct irq_desc *desc = irq_to_desc(i); - -		desc->status = IRQ_DISABLED; -		desc->action = NULL; -		desc->depth = 1; - -		/* -		 * 16 old-style INTA-cycle interrupts: -		 */ -		set_irq_chip_and_handler_name(i, &i8259A_chip, -						      handle_level_irq, "XT"); -	} -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -	/* -	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper -	 * IPI, driven by wakeup. 
-	 */ -	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - -	/* IPIs for invalidation */ -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - -	/* IPI for generic function call */ -	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - -	/* IPI for generic single function call */ -	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, -			call_function_single_interrupt); - -	/* Low priority IPI to cleanup after moving an irq */ -	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -} - -static void __init apic_intr_init(void) -{ -	smp_intr_init(); - -	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - -	/* self generated IPI for local APIC timer */ -	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - -	/* generic IPI for platform specific use */ -	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - -	/* IPI vectors for APIC spurious and error interrupts */ -	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); -	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -} - -void __init native_init_IRQ(void) -{ -	int i; - -	init_ISA_irqs(); -	/* -	 * Cover the whole vector space, no vector can escape -	 * us. 
(some of these will be overridden and become -	 * 'special' SMP interrupts) -	 */ -	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { -		int vector = FIRST_EXTERNAL_VECTOR + i; -		if (vector != IA32_SYSCALL_VECTOR) -			set_intr_gate(vector, interrupt[i]); -	} - -	apic_intr_init(); - -	if (!acpi_ioapic) -		setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919..8d82a77a3f3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)  	gdb_regs32[GDB_PS]	= *(unsigned long *)(p->thread.sp + 8);  	gdb_regs32[GDB_CS]	= __KERNEL_CS;  	gdb_regs32[GDB_SS]	= __KERNEL_DS; -	gdb_regs[GDB_PC]	= p->thread.ip; +	gdb_regs[GDB_PC]	= 0;  	gdb_regs[GDB_R8]	= 0;  	gdb_regs[GDB_R9]	= 0;  	gdb_regs[GDB_R10]	= 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b..a78ecad0c90 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@  #include <linux/mm.h>  #include <linux/highmem.h>  #include <linux/hardirq.h> +#include <asm/timer.h>  #define MMU_QUEUE_SIZE 1024 @@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)  	struct kvm_para_state *state = kvm_para_state();  	mmu_queue_flush(state); -	paravirt_leave_lazy(paravirt_get_lazy_mode()); +	paravirt_leave_lazy_mmu();  	state->mode = paravirt_get_lazy_mode();  } @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)  		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;  		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;  	} +#ifdef CONFIG_X86_IO_APIC +	no_timer_check = 1; +#endif  }  void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c..366baa17991 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@   *  Licensed under the terms of the GNU General Public   *  License version 2. See file COPYING for details.   
*/ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h>  #include <linux/firmware.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h>  #include <linux/pci_ids.h>  #include <linux/uaccess.h>  #include <linux/vmalloc.h>  #include <linux/kernel.h>  #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h>  #include <linux/pci.h> -#include <linux/fs.h> -#include <linux/mm.h>  #include <asm/microcode.h>  #include <asm/processor.h> @@ -79,9 +67,6 @@ struct microcode_amd {  #define UCODE_CONTAINER_SECTION_HDR	8  #define UCODE_CONTAINER_HEADER_SIZE	12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); -  static struct equiv_cpu_entry *equiv_cpu_table;  static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)  	return 1;  } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu)  { -	unsigned long flags;  	u32 rev, dummy;  	int cpu_num = raw_smp_processor_id();  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)  	BUG_ON(cpu_num != cpu);  	if (mc_amd == NULL) -		return; +		return 0; -	spin_lock_irqsave(µcode_update_lock, flags);  	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);  	/* get patch id after patching */  	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); -	spin_unlock_irqrestore(µcode_update_lock, flags);  	/* check current patch id and patch's id for match */  	if (rev != mc_amd->hdr.patch_id) {  		printk(KERN_ERR "microcode: CPU%d: update failed "  		       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); -		return; +		return -1;  	}  	printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",  	       cpu, rev);  	uci->cpu_sig.rev = rev; + +	return 0;  }  static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)  static void free_equiv_cpu_table(void)  { -	if (equiv_cpu_table) { -		vfree(equiv_cpu_table); -		equiv_cpu_table = NULL; -	} +	vfree(equiv_cpu_table); +	equiv_cpu_table = NULL;  } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	const u8 *ucode_ptr = data; @@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)  	int new_rev = uci->cpu_sig.rev;  	unsigned int leftover;  	unsigned long offset; +	enum ucode_state state = UCODE_OK;  	offset = install_equiv_cpu_table(ucode_ptr);  	if (!offset) {  		printk(KERN_ERR "microcode: failed to create "  		       "equivalent cpu table\n"); -		return -EINVAL; +		return UCODE_ERROR;  	}  	ucode_ptr += offset; @@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)  		mc_header = (struct microcode_header_amd *)mc;  		if (get_matching_microcode(cpu, mc, new_rev)) { -			if (new_mc) -				vfree(new_mc); +			vfree(new_mc);  			new_rev = mc_header->patch_id;  			new_mc  = mc;  		} else @@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)  	if (new_mc) {  		if (!leftover) { -			if (uci->mc) -				vfree(uci->mc); +			vfree(uci->mc);  			uci->mc = new_mc;  			pr_debug("microcode: CPU%d found a 
matching microcode "  				 "update with version 0x%x (current=0x%x)\n",  				 cpu, new_rev, uci->cpu_sig.rev); -		} else +		} else {  			vfree(new_mc); -	} +			state = UCODE_ERROR; +		} +	} else +		state = UCODE_NFOUND;  	free_equiv_cpu_table(); -	return (int)leftover; +	return state;  } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device)  {  	const char *fw_name = "amd-ucode/microcode_amd.bin";  	const struct firmware *firmware; -	int ret; - -	/* We should bind the task to the CPU */ -	BUG_ON(cpu != raw_smp_processor_id()); +	enum ucode_state ret; -	ret = request_firmware(&firmware, fw_name, device); -	if (ret) { +	if (request_firmware(&firmware, fw_name, device)) {  		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); -		return ret; +		return UCODE_NFOUND;  	}  	ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)  	return ret;  } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size)  {  	printk(KERN_INFO "microcode: AMD microcode update via "  	       "/dev/cpu/microcode not supported\n"); -	return -1; +	return UCODE_ERROR;  }  static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d..9371448290a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@   *		Thanks to Stuart Swales for pointing out this bug.   */  #include <linux/platform_device.h> -#include <linux/capability.h>  #include <linux/miscdevice.h> -#include <linux/firmware.h> +#include <linux/capability.h>  #include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h>  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h>  #include <linux/cpu.h>  #include <linux/fs.h>  #include <linux/mm.h>  #include <asm/microcode.h>  #include <asm/processor.h> -#include <asm/msr.h>  MODULE_DESCRIPTION("Microcode Update Driver");  MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");  static struct microcode_ops	*microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + *   the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. 
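A note on the rewritten synchronization scheme above: instead of temporarily rebinding the calling task (as the removed set_cpus_allowed_ptr()/work_on_cpu() paths did), per-CPU work is now pushed to the target processor with smp_call_function_single(), and the result comes back through a small context struct, which is the shape of collect_cpu_info_on_target() and apply_microcode_on_target() just below. A condensed sketch of that pattern (kernel-style fragment; do_something_on() is a placeholder, not a real function):

	#include <linux/smp.h>

	struct op_ctx {
		int err;
	};

	static void op_local(void *arg)
	{
		struct op_ctx *ctx = arg;

		/* executes on the target cpu */
		ctx->err = do_something_on(smp_processor_id());
	}

	static int op_on_target(int cpu)
	{
		struct op_ctx ctx = { .err = 0 };
		int ret;

		ret = smp_call_function_single(cpu, op_local, &ctx, 1);

		return ret ? ret : ctx.err;
	}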
+ */  static DEFINE_MUTEX(microcode_mutex);  struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];  EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { +	struct cpu_signature	*cpu_sig; +	int			err; +}; + +static void collect_cpu_info_local(void *arg) +{ +	struct cpu_info_ctx *ctx = arg; + +	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), +						   ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ +	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; +	int ret; + +	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); +	if (!ret) +		ret = ctx.err; + +	return ret; +} + +static int collect_cpu_info(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	int ret; + +	memset(uci, 0, sizeof(*uci)); + +	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); +	if (!ret) +		uci->valid = 1; + +	return ret; +} + +struct apply_microcode_ctx { +	int err; +}; + +static void apply_microcode_local(void *arg) +{ +	struct apply_microcode_ctx *ctx = arg; + +	ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ +	struct apply_microcode_ctx ctx = { .err = 0 }; +	int ret; + +	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); +	if (!ret) +		ret = ctx.err; + +	return ret; +} +  #ifdef CONFIG_MICROCODE_OLD_INTERFACE  static int do_microcode_update(const void __user *buf, size_t size)  { -	cpumask_t old;  	int error = 0;  	int cpu; -	old = current->cpus_allowed; -  	for_each_online_cpu(cpu) {  		struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +		enum ucode_state ustate;  		if (!uci->valid)  			continue; -		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); -		error = microcode_ops->request_microcode_user(cpu, buf, size); -		if (error < 0) -			goto out; -		if (!error) -			microcode_ops->apply_microcode(cpu); +		ustate = microcode_ops->request_microcode_user(cpu, buf, size); +		if (ustate == UCODE_ERROR) { +			error = -1; +			break; +		} else if (ustate == UCODE_OK) +			apply_microcode_on_target(cpu);  	} -out: -	set_cpus_allowed_ptr(current, &old); +  	return error;  } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)  static ssize_t microcode_write(struct file *file, const char __user *buf,  			       size_t len, loff_t *ppos)  { -	ssize_t ret; +	ssize_t ret = -EINVAL;  	if ((len >> PAGE_SHIFT) > num_physpages) { -		printk(KERN_ERR "microcode: too much data (max %ld pages)\n", -		       num_physpages); -		return -EINVAL; +		pr_err("microcode: too much data (max %ld pages)\n", num_physpages); +		return ret;  	}  	get_online_cpus();  	mutex_lock(µcode_mutex); -	ret = do_microcode_update(buf, len); -	if (!ret) +	if (do_microcode_update(buf, len) == 0)  		ret = (ssize_t)len;  	mutex_unlock(µcode_mutex); @@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,  }  static const struct file_operations microcode_fops = { -	.owner		= THIS_MODULE, -	.write		= microcode_write, -	.open		= microcode_open, +	.owner			= THIS_MODULE, +	.write			= microcode_write, +	.open			= microcode_open,  };  static struct miscdevice microcode_dev = { -	.minor		= MICROCODE_MINOR, -	.name		= "microcode", -	.fops		= µcode_fops, +	.minor			= MICROCODE_MINOR, +	.name			= "microcode", +	.devnode		= "cpu/microcode", +	.fops			= µcode_fops,  };  static int __init microcode_dev_init(void) @@ -182,9 +246,7 @@ static int __init 
microcode_dev_init(void)  	error = misc_register(µcode_dev);  	if (error) { -		printk(KERN_ERR -			"microcode: can't misc_register on minor=%d\n", -			MICROCODE_MINOR); +		pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);  		return error;  	} @@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);  /* fake device for request_firmware */  static struct platform_device	*microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu)  { -	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	int err = 0;  	mutex_lock(µcode_mutex);  	if (uci->valid) { -		err = microcode_ops->request_microcode_fw(smp_processor_id(), -							  µcode_pdev->dev); -		if (!err) -			microcode_ops->apply_microcode(smp_processor_id()); +		enum ucode_state ustate; + +		ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); +		if (ustate == UCODE_OK) +			apply_microcode_on_target(cpu); +		else +			if (ustate == UCODE_ERROR) +				err = -EINVAL;  	}  	mutex_unlock(µcode_mutex); +  	return err;  }  static ssize_t reload_store(struct sys_device *dev,  			    struct sysdev_attribute *attr, -			    const char *buf, size_t sz) +			    const char *buf, size_t size)  { -	char *end; -	unsigned long val = simple_strtoul(buf, &end, 0); -	int err = 0; +	unsigned long val;  	int cpu = dev->id; +	int ret = 0; +	char *end; +	val = simple_strtoul(buf, &end, 0);  	if (end == buf)  		return -EINVAL; +  	if (val == 1) {  		get_online_cpus();  		if (cpu_online(cpu)) -			err = work_on_cpu(cpu, reload_for_cpu, NULL); +			ret = reload_for_cpu(cpu);  		put_online_cpus();  	} -	if (err) -		return err; -	return sz; + +	if (!ret) +		ret = size; + +	return ret;  }  static ssize_t version_show(struct sys_device *dev, @@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = {  };  static struct attribute_group mc_attr_group = { -	.attrs		= mc_default_attrs, -	.name		= "microcode", +	.attrs			= mc_default_attrs, +	.name			= "microcode",  }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu)  	uci->valid = 0;  } -static void microcode_fini_cpu(int cpu) -{ -	mutex_lock(µcode_mutex); -	__microcode_fini_cpu(cpu); -	mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	memset(uci, 0, sizeof(*uci)); -	if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) -		uci->valid = 1; +	if (!uci->mc) +		return UCODE_NFOUND; + +	pr_debug("microcode: CPU%d updated upon resume\n", cpu); +	apply_microcode_on_target(cpu); + +	return UCODE_OK;  } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu)  { -	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	struct cpu_signature nsig; +	enum ucode_state ustate; -	pr_debug("microcode: CPU%d resumed\n", cpu); +	if (collect_cpu_info(cpu)) +		return UCODE_ERROR; -	if (!uci->mc) -		return 1; +	/* --dimm. Trigger a delayed update? 
*/ +	if (system_state != SYSTEM_RUNNING) +		return UCODE_NFOUND; -	/* -	 * Let's verify that the 'cached' ucode does belong -	 * to this cpu (a bit of paranoia): -	 */ -	if (microcode_ops->collect_cpu_info(cpu, &nsig)) { -		__microcode_fini_cpu(cpu); -		printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", -				cpu); -		return -1; -	} +	ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); -	if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { -		__microcode_fini_cpu(cpu); -		printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", -				cpu); -		/* Should we look for a new ucode here? */ -		return 1; +	if (ustate == UCODE_OK) { +		pr_debug("microcode: CPU%d updated upon init\n", cpu); +		apply_microcode_on_target(cpu);  	} -	return 0; +	return ustate;  } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu)  { -	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); -	int err = 0; - -	/* -	 * Check if the system resume is in progress (uci->valid != NULL), -	 * otherwise just request a firmware: -	 */ -	if (uci->valid) { -		err = microcode_resume_cpu(smp_processor_id()); -	} else { -		collect_cpu_info(smp_processor_id()); -		if (uci->valid && system_state == SYSTEM_RUNNING) -			err = microcode_ops->request_microcode_fw( -					smp_processor_id(), -					µcode_pdev->dev); -	} -	if (!err) -		microcode_ops->apply_microcode(smp_processor_id()); -	return err; -} +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	enum ucode_state ustate; -static int microcode_init_cpu(int cpu) -{ -	int err; -	mutex_lock(µcode_mutex); -	err = work_on_cpu(cpu, microcode_update_cpu, NULL); -	mutex_unlock(µcode_mutex); +	if (uci->valid) +		ustate = microcode_resume_cpu(cpu); +	else +		ustate = microcode_init_cpu(cpu); -	return err; +	return ustate;  }  static int mc_sysdev_add(struct sys_device *sys_dev)  {  	int err, cpu = sys_dev->id; -	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	if (!cpu_online(cpu))  		return 0;  	pr_debug("microcode: CPU%d added\n", cpu); -	memset(uci, 0, sizeof(*uci));  	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);  	if (err)  		return err; -	err = microcode_init_cpu(cpu); +	if (microcode_init_cpu(cpu) == UCODE_ERROR) +		err = -EINVAL;  	return err;  } @@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)  static int mc_sysdev_resume(struct sys_device *dev)  {  	int cpu = dev->id; +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	if (!cpu_online(cpu))  		return 0; -	/* only CPU 0 will apply ucode here */ -	microcode_update_cpu(NULL); +	/* +	 * All non-bootup cpus are still disabled, +	 * so only CPU 0 will apply ucode here. +	 * +	 * Moreover, there can be no concurrent +	 * updates from any other places at this point. 
+	 */ +	WARN_ON(cpu != 0); + +	if (uci->valid && uci->mc) +		microcode_ops->apply_microcode(cpu); +  	return 0;  }  static struct sysdev_driver mc_sysdev_driver = { -	.add		= mc_sysdev_add, -	.remove		= mc_sysdev_remove, -	.resume		= mc_sysdev_resume, +	.add			= mc_sysdev_add, +	.remove			= mc_sysdev_remove, +	.resume			= mc_sysdev_resume,  };  static __cpuinit int @@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: -		if (microcode_init_cpu(cpu)) -			printk(KERN_ERR "microcode: failed to init CPU%d\n", -			       cpu); +		microcode_update_cpu(cpu);  	case CPU_DOWN_FAILED:  	case CPU_DOWN_FAILED_FROZEN:  		pr_debug("microcode: CPU%d added\n", cpu);  		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) -			printk(KERN_ERR "microcode: Failed to create the sysfs " -				"group for CPU%d\n", cpu); +			pr_err("microcode: Failed to create group for CPU%d\n", cpu);  		break;  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +509,10 @@ static int __init microcode_init(void)  		microcode_ops = init_amd_microcode();  	if (!microcode_ops) { -		printk(KERN_ERR "microcode: no support for this CPU vendor\n"); +		pr_err("microcode: no support for this CPU vendor\n");  		return -ENODEV;  	} -	error = microcode_dev_init(); -	if (error) -		return error;  	microcode_pdev = platform_device_register_simple("microcode", -1,  							 NULL, 0);  	if (IS_ERR(microcode_pdev)) { @@ -480,23 +521,31 @@ static int __init microcode_init(void)  	}  	get_online_cpus(); +	mutex_lock(µcode_mutex); +  	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + +	mutex_unlock(µcode_mutex);  	put_online_cpus(); +  	if (error) { -		microcode_dev_exit();  		platform_device_unregister(microcode_pdev);  		return error;  	} +	error = microcode_dev_init(); +	if (error) +		return error; +  	register_hotcpu_notifier(&mc_cpu_notifier); -	printk(KERN_INFO -	       "Microcode Update Driver: v" MICROCODE_VERSION +	pr_info("Microcode Update Driver: v" MICROCODE_VERSION  	       " <tigran@aivazian.fsnet.co.uk>,"  	       " Peter Oruba\n");  	return 0;  } +module_init(microcode_init);  static void __exit microcode_exit(void)  { @@ -505,16 +554,17 @@ static void __exit microcode_exit(void)  	unregister_hotcpu_notifier(&mc_cpu_notifier);  	get_online_cpus(); +	mutex_lock(µcode_mutex); +  	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + +	mutex_unlock(µcode_mutex);  	put_online_cpus();  	platform_device_unregister(microcode_pdev);  	microcode_ops = NULL; -	printk(KERN_INFO -	       "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");  } - -module_init(microcode_init);  module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1a..0d334ddd0a9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@   *		Fix sigmatch() macro to handle old CPUs with pf == 0.   *		Thanks to Stuart Swales for pointing out this bug.   
*/ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h>  #include <linux/firmware.h> -#include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h>  #include <linux/uaccess.h> -#include <linux/vmalloc.h>  #include <linux/kernel.h>  #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/fs.h> -#include <linux/mm.h> +#include <linux/vmalloc.h>  #include <asm/microcode.h>  #include <asm/processor.h> @@ -150,13 +137,9 @@ struct extended_sigtable {  #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); -  static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu_num); -	unsigned long flags;  	unsigned int val[2];  	memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)  		csig->pf = 1 << ((val[1] >> 18) & 7);  	} -	/* serialize access to the physical write to MSR 0x79 */ -	spin_lock_irqsave(µcode_update_lock, flags); -  	wrmsr(MSR_IA32_UCODE_REV, 0, 0);  	/* see notes above for revision 1.07.  Apparent chip bug */  	sync_core();  	/* get the current revision from MSR 0x8B */  	rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); -	spin_unlock_irqrestore(µcode_update_lock, flags); -	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", -			csig->sig, csig->pf, csig->rev); +	printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", +			cpu_num, csig->sig, csig->pf, csig->rev);  	return 0;  } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)  	return 0;  } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu)  {  	struct microcode_intel *mc_intel;  	struct ucode_cpu_info *uci; -	unsigned long flags;  	unsigned int val[2];  	int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu)  	BUG_ON(cpu_num != cpu);  	if (mc_intel == NULL) -		return; - -	/* serialize access to the physical write to MSR 0x79 */ -	spin_lock_irqsave(µcode_update_lock, flags); +		return 0;  	/* write microcode via MSR 0x79 */  	wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu)  	/* get the current revision from MSR 0x8B */  	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); -	spin_unlock_irqrestore(µcode_update_lock, flags);  	if (val[1] != mc_intel->hdr.rev) { -		printk(KERN_ERR "microcode: CPU%d update from revision " -				"0x%x to 0x%x failed\n", -			cpu_num, uci->cpu_sig.rev, val[1]); -		return; +		printk(KERN_ERR "microcode: CPU%d update " +				"to revision 0x%x failed\n", +			cpu_num, mc_intel->hdr.rev); +		return -1;  	} -	printk(KERN_INFO "microcode: CPU%d updated from revision " -			 "0x%x to 0x%x, date = %04x-%02x-%02x \n", -		cpu_num, uci->cpu_sig.rev, val[1], +	printk(KERN_INFO "microcode: CPU%d updated to revision " +			 "0x%x, date = %04x-%02x-%02x \n", +		cpu_num, val[1],  		mc_intel->hdr.date & 0xffff,  		mc_intel->hdr.date >> 24,  		(mc_intel->hdr.date >> 16) & 0xff);  	uci->cpu_sig.rev = val[1]; + +	return 0;  } -static int generic_load_microcode(int cpu, void *data, size_t size, -		int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, +				int (*get_ucode_data)(void *, 
const void *, size_t))  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	u8 *ucode_ptr = data, *new_mc = NULL, *mc;  	int new_rev = uci->cpu_sig.rev;  	unsigned int leftover = size; +	enum ucode_state state = UCODE_OK;  	while (leftover) {  		struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,  		leftover  -= mc_size;  	} -	if (!new_mc) +	if (leftover) { +		if (new_mc) +			vfree(new_mc); +		state = UCODE_ERROR;  		goto out; +	} -	if (leftover) { -		vfree(new_mc); +	if (!new_mc) { +		state = UCODE_NFOUND;  		goto out;  	} @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,  	pr_debug("microcode: CPU%d found a matching microcode update with"  		 " version 0x%x (current=0x%x)\n",  			cpu, new_rev, uci->cpu_sig.rev); - - out: -	return (int)leftover; +out: +	return state;  }  static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)  	return 0;  } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device)  {  	char name[30];  	struct cpuinfo_x86 *c = &cpu_data(cpu);  	const struct firmware *firmware; -	int ret; +	enum ucode_state ret; -	/* We should bind the task to the CPU */ -	BUG_ON(cpu != raw_smp_processor_id());  	sprintf(name, "intel-ucode/%02x-%02x-%02x",  		c->x86, c->x86_model, c->x86_mask); -	ret = request_firmware(&firmware, name, device); -	if (ret) { + +	if (request_firmware(&firmware, name, device)) {  		pr_debug("microcode: data file %s load failed\n", name); -		return ret; +		return UCODE_NFOUND;  	}  	ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)  	return copy_from_user(to, from, n);  } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size)  { -	/* We should bind the task to the CPU */ -	BUG_ON(cpu != raw_smp_processor_id()); -  	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);  } diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c index c23880b90b5..89f386f044e 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module.c @@ -1,6 +1,5 @@ -/*  Kernel module help for x86-64 +/*  Kernel module help for x86.      Copyright (C) 2001 Rusty Russell. -    Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.      This program is free software; you can redistribute it and/or modify      it under the terms of the GNU General Public License as published by @@ -22,23 +21,18 @@  #include <linux/fs.h>  #include <linux/string.h>  #include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h>  #include <linux/bug.h> +#include <linux/mm.h>  #include <asm/system.h>  #include <asm/page.h>  #include <asm/pgtable.h> +#if 0 +#define DEBUGP printk +#else  #define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ -	vfree(module_region); -	/* FIXME: If module_region == mod->init_region, trim exception -	   table entries. 
*/ -} +#endif  void *module_alloc(unsigned long size)  { @@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)  	if (!area)  		return NULL; -	return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +	return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, +					PAGE_KERNEL_EXEC); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ +	vfree(module_region);  } -#endif  /* We don't need anything special. */  int module_frob_arch_sections(Elf_Ehdr *hdr, @@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,  	return 0;  } +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, +		   const char *strtab, +		   unsigned int symindex, +		   unsigned int relsec, +		   struct module *me) +{ +	unsigned int i; +	Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; +	Elf32_Sym *sym; +	uint32_t *location; + +	DEBUGP("Applying relocate section %u to %u\n", relsec, +	       sechdrs[relsec].sh_info); +	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { +		/* This is where to make the change */ +		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr +			+ rel[i].r_offset; +		/* This is the symbol it is referring to.  Note that all +		   undefined symbols have been resolved.  */ +		sym = (Elf32_Sym *)sechdrs[symindex].sh_addr +			+ ELF32_R_SYM(rel[i].r_info); + +		switch (ELF32_R_TYPE(rel[i].r_info)) { +		case R_386_32: +			/* We add the value into the location given */ +			*location += sym->st_value; +			break; +		case R_386_PC32: +			/* Add the value, subtract its postition */ +			*location += sym->st_value - (uint32_t)location; +			break; +		default: +			printk(KERN_ERR "module %s: Unknown relocation: %u\n", +			       me->name, ELF32_R_TYPE(rel[i].r_info)); +			return -ENOEXEC; +		} +	} +	return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, +		       const char *strtab, +		       unsigned int symindex, +		       unsigned int relsec, +		       struct module *me) +{ +	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", +	       me->name); +	return -ENOEXEC; +} +#else /*X86_64*/  int apply_relocate_add(Elf64_Shdr *sechdrs,  		   const char *strtab,  		   unsigned int symindex, @@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,  	return -ENOSYS;  } +#endif +  int module_finalize(const Elf_Ehdr *hdr,  		    const Elf_Shdr *sechdrs,  		    struct module *me) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 0edd819050e..00000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null @@ -1,152 +0,0 @@ -/*  Kernel module help for i386. -    Copyright (C) 2001 Rusty Russell. - -    This program is free software; you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation; either version 2 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. 
- -    You should have received a copy of the GNU General Public License -    along with this program; if not, write to the Free Software -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA -*/ -#include <linux/moduleloader.h> -#include <linux/elf.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/bug.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - -void *module_alloc(unsigned long size) -{ -	if (size == 0) -		return NULL; -	return vmalloc_exec(size); -} - - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ -	vfree(module_region); -	/* FIXME: If module_region == mod->init_region, trim exception -	   table entries. */ -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, -			      Elf_Shdr *sechdrs, -			      char *secstrings, -			      struct module *mod) -{ -	return 0; -} - -int apply_relocate(Elf32_Shdr *sechdrs, -		   const char *strtab, -		   unsigned int symindex, -		   unsigned int relsec, -		   struct module *me) -{ -	unsigned int i; -	Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; -	Elf32_Sym *sym; -	uint32_t *location; - -	DEBUGP("Applying relocate section %u to %u\n", relsec, -	       sechdrs[relsec].sh_info); -	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { -		/* This is where to make the change */ -		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr -			+ rel[i].r_offset; -		/* This is the symbol it is referring to.  Note that all -		   undefined symbols have been resolved.  */ -		sym = (Elf32_Sym *)sechdrs[symindex].sh_addr -			+ ELF32_R_SYM(rel[i].r_info); - -		switch (ELF32_R_TYPE(rel[i].r_info)) { -		case R_386_32: -			/* We add the value into the location given */ -			*location += sym->st_value; -			break; -		case R_386_PC32: -			/* Add the value, subtract its postition */ -			*location += sym->st_value - (uint32_t)location; -			break; -		default: -			printk(KERN_ERR "module %s: Unknown relocation: %u\n", -			       me->name, ELF32_R_TYPE(rel[i].r_info)); -			return -ENOEXEC; -		} -	} -	return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, -		       const char *strtab, -		       unsigned int symindex, -		       unsigned int relsec, -		       struct module *me) -{ -	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", -	       me->name); -	return -ENOEXEC; -} - -int module_finalize(const Elf_Ehdr *hdr, -		    const Elf_Shdr *sechdrs, -		    struct module *me) -{ -	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, -		*para = NULL; -	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - -	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { -		if (!strcmp(".text", secstrings + s->sh_name)) -			text = s; -		if (!strcmp(".altinstructions", secstrings + s->sh_name)) -			alt = s; -		if (!strcmp(".smp_locks", secstrings + s->sh_name)) -			locks = s; -		if (!strcmp(".parainstructions", secstrings + s->sh_name)) -			para = s; -	} - -	if (alt) { -		/* patch .altinstructions */ -		void *aseg = (void *)alt->sh_addr; -		apply_alternatives(aseg, aseg + alt->sh_size); -	} -	if (locks && text) { -		void *lseg = (void *)locks->sh_addr; -		void *tseg = (void *)text->sh_addr; -		alternatives_smp_module_add(me, me->name, -					    lseg, lseg + locks->sh_size, -					    tseg, tseg + text->sh_size); -	} - -	if (para) { -		void *pseg = (void *)para->sh_addr; -		apply_paravirt(pseg, pseg + para->sh_size); -	} - 
-	return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ -	alternatives_smp_module_del(mod); -	module_bug_cleanup(mod); -} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c1..651c93b2886 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@  #include <linux/acpi.h>  #include <linux/module.h>  #include <linux/smp.h> +#include <linux/pci.h>  #include <asm/mtrr.h>  #include <asm/mpspec.h> @@ -870,24 +871,17 @@ static  inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}  #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, -		      int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)  { -	if (!mpc_new_phys) { -		pr_info("No spare slots, try to append...take your risk, " -			"new mpc_length %x\n", count); -	} else { -		if (count <= mpc_new_length) -			pr_info("No spare slots, try to append..., " -				"new mpc_length %x\n", count); -		else { -			pr_err("mpc_new_length %lx is too small\n", -				mpc_new_length); -			return -1; -		} +	int ret = 0; + +	if (!mpc_new_phys || count <= mpc_new_length) { +		WARN(1, "update_mptable: No spare slots (length: %x)\n", count); +		return -1;  	} -	return 0; +	return ret;  }  static int  __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +940,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,  		} else {  			struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;  			count += sizeof(struct mpc_intsrc); -			if (!check_slot(mpc_new_phys, mpc_new_length, count)) +			if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)  				goto out;  			assign_to_mpc_intsrc(&mp_irqs[i], m);  			mpc->length = count; @@ -963,11 +957,14 @@ out:  	return 0;  } -static int __initdata enable_update_mptable; +int enable_update_mptable;  static int __init update_mptable_setup(char *str)  {  	enable_update_mptable = 1; +#ifdef CONFIG_PCI +	pci_routeirq = 1; +#endif  	return 0;  }  early_param("update_mptable", update_mptable_setup); @@ -980,6 +977,9 @@ static int __initdata alloc_mptable;  static int __init parse_alloc_mptable_opt(char *p)  {  	enable_update_mptable = 1; +#ifdef CONFIG_PCI +	pci_routeirq = 1; +#endif  	alloc_mptable = 1;  	if (!p)  		return 0; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3cf3413ec62..98fd6cd4e3a 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {  	.notifier_call = msr_class_cpu_callback,  }; +static char *msr_nodename(struct device *dev) +{ +	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} +  static int __init msr_init(void)  {  	int i, err = 0; @@ -212,6 +217,7 @@ static int __init msr_init(void)  		err = PTR_ERR(msr_class);  		goto out_chrdev;  	} +	msr_class->nodename = msr_nodename;  	for_each_online_cpu(i) {  		err = msr_device_create(i);  		if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9faf43bea33..70ec9b951d7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA  static inline void enter_lazy(enum paravirt_lazy_mode mode)  { -	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); -	BUG_ON(preemptible()); +	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); -	
__get_cpu_var(paravirt_lazy_mode) = mode; +	percpu_write(paravirt_lazy_mode, mode);  } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode)  { -	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); -	BUG_ON(preemptible()); +	BUG_ON(percpu_read(paravirt_lazy_mode) != mode); -	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; +	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);  }  void paravirt_enter_lazy_mmu(void) @@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)  void paravirt_leave_lazy_mmu(void)  { -	paravirt_leave_lazy(PARAVIRT_LAZY_MMU); +	leave_lazy(PARAVIRT_LAZY_MMU);  } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev)  { +	BUG_ON(preemptible()); + +	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { +		arch_leave_lazy_mmu_mode(); +		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); +	}  	enter_lazy(PARAVIRT_LAZY_CPU);  } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next)  { -	paravirt_leave_lazy(PARAVIRT_LAZY_CPU); +	BUG_ON(preemptible()); + +	leave_lazy(PARAVIRT_LAZY_CPU); + +	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) +		arch_enter_lazy_mmu_mode();  }  enum paravirt_lazy_mode paravirt_get_lazy_mode(void)  { -	return __get_cpu_var(paravirt_lazy_mode); +	if (in_interrupt()) +		return PARAVIRT_LAZY_NONE; + +	return percpu_read(paravirt_lazy_mode);  }  void arch_flush_lazy_mmu_mode(void) @@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)  	preempt_disable();  	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { -		WARN_ON(preempt_count() == 1);  		arch_leave_lazy_mmu_mode();  		arch_enter_lazy_mmu_mode();  	} @@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)  	preempt_enable();  } -void arch_flush_lazy_cpu_mode(void) -{ -	preempt_disable(); - -	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { -		WARN_ON(preempt_count() == 1); -		arch_leave_lazy_cpu_mode(); -		arch_enter_lazy_cpu_mode(); -	} - -	preempt_enable(); -} -  struct pv_info pv_info = {  	.name = "bare hardware",  	.paravirt_enabled = 0, @@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {  	.set_iopl_mask = native_set_iopl_mask,  	.io_delay = native_io_delay, -	.lazy_mode = { -		.enter = paravirt_nop, -		.leave = paravirt_nop, -	}, +	.start_context_switch = paravirt_nop, +	.end_context_switch = paravirt_nop,  };  struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f..971a3bec47a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {  static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, -	int expected, unsigned long start, unsigned long end) -{ -	unsigned long idx = start; - -	BUG_ON(start >= end); - -	while (idx < end) { -		if (!!test_bit(idx, bitmap) != expected) -			return idx; -		++idx; -	} - -	/* all bits have the expected value */ -	return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, -	int expected, unsigned long start, unsigned long end) -{ -	return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ -  static 
inline int translation_enabled(struct iommu_table *tbl)  {  	/* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,  {  	unsigned long index;  	unsigned long end; -	unsigned long badbit;  	unsigned long flags;  	index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,  	spin_lock_irqsave(&tbl->it_lock, flags); -	badbit = verify_bit_range(tbl->it_map, 0, index, end); -	if (badbit != ~0UL) { -		if (printk_ratelimit()) -			printk(KERN_ERR "Calgary: entry already allocated at " -			       "0x%lx tbl %p dma 0x%lx npages %u\n", -			       badbit, tbl, start_addr, npages); -	} -  	iommu_area_reserve(tbl->it_map, index, npages);  	spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,  	unsigned int npages)  {  	unsigned long entry; -	unsigned long badbit;  	unsigned long badend;  	unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,  	spin_lock_irqsave(&tbl->it_lock, flags); -	badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); -	if (badbit != ~0UL) { -		if (printk_ratelimit()) -			printk(KERN_ERR "Calgary: bit is off at 0x%lx " -			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", -			       badbit, tbl, dma_addr, entry, npages); -	} -  	iommu_area_free(tbl->it_map, entry, npages);  	spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void)  		iommu_detected = 1;  		calgary_detected = 1;  		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); -		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " -		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, -		       debugging ? "enabled" : "disabled"); +		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", +		       specified_table_size);  		/* swiotlb for devices that aren't behind the Calgary. 
*/  		if (max_pfn > MAX_DMA32_PFN) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc825..47630479b06 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -32,6 +32,8 @@ int no_iommu __read_mostly;  /* Set this to 1 if there is a HW IOMMU in the system */  int iommu_detected __read_mostly = 0; +int iommu_pass_through; +  dma_addr_t bad_dma_address __read_mostly = 0;  EXPORT_SYMBOL(bad_dma_address); @@ -209,6 +211,10 @@ static __init int iommu_setup(char *p)  #ifdef CONFIG_SWIOTLB  		if (!strncmp(p, "soft", 4))  			swiotlb = 1; +		if (!strncmp(p, "pt", 2)) { +			iommu_pass_through = 1; +			return 1; +		}  #endif  		gart_parse_options(p); @@ -290,6 +296,8 @@ static int __init pci_iommu_init(void)  void pci_iommu_shutdown(void)  {  	gart_iommu_shutdown(); + +	amd_iommu_shutdown();  }  /* Must execute after PCI subsystem */  fs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035..cfd9f906389 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void)  }  #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x)							\ -	do {								\ -		if (iommu_leak_tab)					\ -			iommu_leak_tab[x] = __builtin_return_address(0);\ -	} while (0) - -#define CLEAR_LEAK(x)							\ -	do {								\ -		if (iommu_leak_tab)					\ -			iommu_leak_tab[x] = NULL;			\ -	} while (0) -  /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab;  static int leak_trace;  static int iommu_leak_pages = 20;  static void dump_leak(void)  { -	int i;  	static int dump; -	if (dump || !iommu_leak_tab) +	if (dump)  		return;  	dump = 1; -	show_stack(NULL, NULL); -	/* Very crude. dump some from the end of the table too */ -	printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", -	       iommu_leak_pages); -	for (i = 0; i < iommu_leak_pages; i += 2) { -		printk(KERN_DEBUG "%lu: ", iommu_pages-i); -		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], -				0); -		printk(KERN_CONT "%c", (i+1)%2 == 0 ? 
'\n' : ' '); -	} -	printk(KERN_DEBUG "\n"); +	show_stack(NULL, NULL); +	debug_dma_dump_mappings(NULL);  } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x)  #endif  static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,  	for (i = 0; i < npages; i++) {  		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); -		SET_LEAK(iommu_page + i);  		phys_mem += PAGE_SIZE;  	}  	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,  	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);  	for (i = 0; i < npages; i++) {  		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; -		CLEAR_LEAK(iommu_page + i);  	}  	free_iommu(iommu_page, npages);  } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,  		pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);  		while (pages--) {  			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); -			SET_LEAK(iommu_page);  			addr += PAGE_SIZE;  			iommu_page++;  		} @@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)  	agp_gatt_table = gatt; -	enable_gart_translations(); -  	error = sysdev_class_register(&gart_sysdev_class);  	if (!error)  		error = sysdev_register(&device_gart); @@ -801,11 +769,12 @@ void __init gart_iommu_init(void)  #ifdef CONFIG_IOMMU_LEAK  	if (leak_trace) { -		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, -				  get_order(iommu_pages*sizeof(void *))); -		if (!iommu_leak_tab) +		int ret; + +		ret = dma_debug_resize_entries(iommu_pages); +		if (ret)  			printk(KERN_DEBUG -			       "PCI-DMA: Cannot allocate leak trace area\n"); +			       "PCI-DMA: Cannot trace all the entries\n");  	}  #endif @@ -845,6 +814,14 @@ void __init gart_iommu_init(void)  	 * the pages as Not-Present:  	 */  	wbinvd(); +	 +	/* +	 * Now all caches are flushed and we can safely enable +	 * GART hardware.  Doing it early leaves the possibility +	 * of stale cache entries that can lead to GART PTE +	 * errors. 
+	 */ +	enable_gart_translations();  	/*  	 * Try to workaround a bug (thanks to BenH): diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e26..6af96ee4420 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)  	return paddr;  } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)  {  	return baddr;  } @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)  {  	/* don't initialize swiotlb if iommu=off (no_iommu=1) */  #ifdef CONFIG_X86_64 -	if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) +	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || +		iommu_pass_through)  	       swiotlb = 1;  #endif  	if (swiotlb_force) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..994dd6a4a2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,12 +8,15 @@  #include <linux/module.h>  #include <linux/pm.h>  #include <linux/clockchips.h> +#include <linux/random.h>  #include <trace/power.h>  #include <asm/system.h>  #include <asm/apic.h> +#include <asm/syscalls.h>  #include <asm/idle.h>  #include <asm/uaccess.h>  #include <asm/i387.h> +#include <asm/ds.h>  unsigned long idle_halt;  EXPORT_SYMBOL(idle_halt); @@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)  		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);  		tsk->thread.xstate = NULL;  	} + +	WARN(tsk->thread.ds_ctx, "leaking DS context\n");  }  void free_thread_info(struct thread_info *ti) @@ -58,7 +63,7 @@ void arch_task_cache_init(void)          task_xstate_cachep =          	kmem_cache_create("task_xstate", xstate_size,  				  __alignof__(union thread_xstate), -				  SLAB_PANIC, NULL); +				  SLAB_PANIC | SLAB_NOTRACK, NULL);  }  /* @@ -83,8 +88,6 @@ void exit_thread(void)  		put_cpu();  		kfree(bp);  	} - -	ds_exit_thread(current);  }  void flush_thread(void) @@ -613,3 +616,16 @@ static int __init idle_setup(char *str)  }  early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ +	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) +		sp -= get_random_int() % 8192; +	return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ +	unsigned long range_end = mm->brk + 0x02000000; +	return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..59f4524984a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@   * This file handles the architecture-dependent parts of process handling..   
*/ -#include <stdarg.h> -  #include <linux/stackprotector.h>  #include <linux/cpu.h>  #include <linux/errno.h> @@ -33,7 +31,6 @@  #include <linux/module.h>  #include <linux/kallsyms.h>  #include <linux/ptrace.h> -#include <linux/random.h>  #include <linux/personality.h>  #include <linux/tick.h>  #include <linux/percpu.h> @@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  		p->thread.io_bitmap_max = 0;  	} -	ds_copy_thread(p, current); +	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); +	p->thread.ds_ctx = NULL;  	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);  	p->thread.debugctlmsr = 0; @@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 * done before math_state_restore, so the TS bit is up  	 * to date.  	 */ -	arch_leave_lazy_cpu_mode(); +	arch_end_context_switch(next_p);  	/* If the task has used fpu the last 5 timeslices, just do a full  	 * restore of the math state immediately to avoid the trap; the @@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)  	return 0;  } -unsigned long arch_align_stack(unsigned long sp) -{ -	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -		sp -= get_random_int() % 8192; -	return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ -	unsigned long range_end = mm->brk + 0x02000000; -	return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..ebefb5407b9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@   * This file handles the architecture-dependent parts of process handling..   */ -#include <stdarg.h> -  #include <linux/stackprotector.h>  #include <linux/cpu.h>  #include <linux/errno.h> @@ -32,7 +30,6 @@  #include <linux/delay.h>  #include <linux/module.h>  #include <linux/ptrace.h> -#include <linux/random.h>  #include <linux/notifier.h>  #include <linux/kprobes.h>  #include <linux/kdebug.h> @@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  			goto out;  	} -	ds_copy_thread(p, me); +	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); +	p->thread.ds_ctx = NULL;  	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);  	p->thread.debugctlmsr = 0; @@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 * done before math_state_restore, so the TS bit is up  	 * to date.  	 */ -	arch_leave_lazy_cpu_mode(); +	arch_end_context_switch(next_p);  	/*  	 * Switch FS and GS. @@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)  	return do_arch_prctl(current, code, addr);  } -unsigned long arch_align_stack(unsigned long sp) -{ -	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -		sp -= get_random_int() % 8192; -	return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ -	unsigned long range_end = mm->brk + 0x02000000; -	return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 23b7c8f017e..09ecbde91c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@  #include <linux/audit.h>  #include <linux/seccomp.h>  #include <linux/signal.h> +#include <linux/workqueue.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h> @@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,  }  #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. 
+ * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { +	/* The branch trace handle. */ +	struct bts_tracer	*tracer; + +	/* The buffer used to store the branch trace and its size. */ +	void			*buffer; +	unsigned int		size; + +	/* The mm that paid for the above buffer. */ +	struct mm_struct	*mm; + +	/* The task this context belongs to. */ +	struct task_struct	*task; + +	/* The signal to send on a bts buffer overflow. */ +	unsigned int		bts_ovfl_signal; + +	/* The work struct to destroy a context. */ +	struct work_struct	work; +}; + +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) +{ +	void *buffer = NULL; +	int err = -ENOMEM; + +	err = account_locked_memory(current->mm, current->signal->rlim, size); +	if (err < 0) +		return err; + +	buffer = kzalloc(size, GFP_KERNEL); +	if (!buffer) +		goto out_refund; + +	context->buffer = buffer; +	context->size = size; +	context->mm = get_task_mm(current); + +	return 0; + + out_refund: +	refund_locked_memory(current->mm, size); +	return err; +} + +static inline void free_bts_buffer(struct bts_context *context) +{ +	if (!context->buffer) +		return; + +	kfree(context->buffer); +	context->buffer = NULL; + +	refund_locked_memory(context->mm, context->size); +	context->size = 0; + +	mmput(context->mm); +	context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ +	struct bts_context *context; + +	context = container_of(w, struct bts_context, work); + +	ds_release_bts(context->tracer); +	put_task_struct(context->task); +	free_bts_buffer(context); +	kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ +	INIT_WORK(&context->work, free_bts_context_work); +	schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ +	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); +	if (context) { +		context->task = task; +		task->bts = context; + +		get_task_struct(task); +	} + +	return context; +} +  static int ptrace_bts_read_record(struct task_struct *child, size_t index,  				  struct bts_struct __user *out)  { +	struct bts_context *context;  	const struct bts_trace *trace;  	struct bts_struct bts;  	const unsigned char *at;  	int error; -	trace = ds_read_bts(child->bts); +	context = child->bts; +	if (!context) +		return -ESRCH; + +	trace = ds_read_bts(context->tracer);  	if (!trace) -		return -EPERM; +		return -ESRCH;  	at = trace->ds.top - ((index + 1) * trace->ds.size);  	if ((void *)at < trace->ds.begin) @@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,  	if (!trace->read)  		return -EOPNOTSUPP; -	error = trace->read(child->bts, at, &bts); +	error = trace->read(context->tracer, at, &bts);  	if (error < 0)  		return error; @@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,  			    long size,  			    struct bts_struct __user *out)  { +	struct bts_context 
*context;  	const struct bts_trace *trace;  	const unsigned char *at;  	int error, drained = 0; -	trace = ds_read_bts(child->bts); +	context = child->bts; +	if (!context) +		return -ESRCH; + +	trace = ds_read_bts(context->tracer);  	if (!trace) -		return -EPERM; +		return -ESRCH;  	if (!trace->read)  		return -EOPNOTSUPP; @@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,  	for (at = trace->ds.begin; (void *)at < trace->ds.top;  	     out++, drained++, at += trace->ds.size) {  		struct bts_struct bts; -		int error; -		error = trace->read(child->bts, at, &bts); +		error = trace->read(context->tracer, at, &bts);  		if (error < 0)  			return error; @@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,  	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); -	error = ds_reset_bts(child->bts); +	error = ds_reset_bts(context->tracer);  	if (error < 0)  		return error;  	return drained;  } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ -	child->bts_buffer = alloc_locked_buffer(size); -	if (!child->bts_buffer) -		return -ENOMEM; - -	child->bts_size = size; - -	return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ -	free_locked_buffer(child->bts_buffer, child->bts_size); -	child->bts_buffer = NULL; -	child->bts_size = 0; -} -  static int ptrace_bts_config(struct task_struct *child,  			     long cfg_size,  			     const struct ptrace_bts_config __user *ucfg)  { +	struct bts_context *context;  	struct ptrace_bts_config cfg;  	unsigned int flags = 0; @@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,  	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))  		return -EFAULT; -	if (child->bts) { -		ds_release_bts(child->bts); -		child->bts = NULL; -	} +	context = child->bts; +	if (!context) +		context = alloc_bts_context(child); +	if (!context) +		return -ENOMEM;  	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {  		if (!cfg.signal)  			return -EINVAL; -		child->thread.bts_ovfl_signal = cfg.signal;  		return -EOPNOTSUPP; +		context->bts_ovfl_signal = cfg.signal;  	} -	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && -	    (cfg.size != child->bts_size)) { -		int error; +	ds_release_bts(context->tracer); +	context->tracer = NULL; -		ptrace_bts_free_buffer(child); +	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { +		int err; -		error = ptrace_bts_allocate_buffer(child, cfg.size); -		if (error < 0) -			return error; +		free_bts_buffer(context); +		if (!cfg.size) +			return 0; + +		err = alloc_bts_buffer(context, cfg.size); +		if (err < 0) +			return err;  	}  	if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,  	if (cfg.flags & PTRACE_BTS_O_SCHED)  		flags |= BTS_TIMESTAMPS; -	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, -				    /* ovfl = */ NULL, /* th = */ (size_t)-1, -				    flags); -	if (IS_ERR(child->bts)) { -		int error = PTR_ERR(child->bts); - -		ptrace_bts_free_buffer(child); -		child->bts = NULL; +	context->tracer = +		ds_request_bts_task(child, context->buffer, context->size, +				    NULL, (size_t)-1, flags); +	if (unlikely(IS_ERR(context->tracer))) { +		int error = PTR_ERR(context->tracer); +		free_bts_buffer(context); +		context->tracer = NULL;  		return error;  	} @@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,  			     long cfg_size,  			     struct ptrace_bts_config __user *ucfg)  { +	struct bts_context *context;  	const struct bts_trace 
*trace;  	struct ptrace_bts_config cfg; +	context = child->bts; +	if (!context) +		return -ESRCH; +  	if (cfg_size < sizeof(cfg))  		return -EIO; -	trace = ds_read_bts(child->bts); +	trace = ds_read_bts(context->tracer);  	if (!trace) -		return -EPERM; +		return -ESRCH;  	memset(&cfg, 0, sizeof(cfg)); -	cfg.size = trace->ds.end - trace->ds.begin; -	cfg.signal = child->thread.bts_ovfl_signal; -	cfg.bts_size = sizeof(struct bts_struct); +	cfg.size	= trace->ds.end - trace->ds.begin; +	cfg.signal	= context->bts_ovfl_signal; +	cfg.bts_size	= sizeof(struct bts_struct);  	if (cfg.signal)  		cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,  static int ptrace_bts_clear(struct task_struct *child)  { +	struct bts_context *context;  	const struct bts_trace *trace; -	trace = ds_read_bts(child->bts); +	context = child->bts; +	if (!context) +		return -ESRCH; + +	trace = ds_read_bts(context->tracer);  	if (!trace) -		return -EPERM; +		return -ESRCH;  	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); -	return ds_reset_bts(child->bts); +	return ds_reset_bts(context->tracer);  }  static int ptrace_bts_size(struct task_struct *child)  { +	struct bts_context *context;  	const struct bts_trace *trace; -	trace = ds_read_bts(child->bts); +	context = child->bts; +	if (!context) +		return -ESRCH; + +	trace = ds_read_bts(context->tracer);  	if (!trace) -		return -EPERM; +		return -ESRCH;  	return (trace->ds.top - trace->ds.begin) / trace->ds.size;  } -static void ptrace_bts_fork(struct task_struct *tsk) -{ -	tsk->bts = NULL; -	tsk->bts_buffer = NULL; -	tsk->bts_size = 0; -	tsk->thread.bts_ovfl_signal = 0; -} - -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +void ptrace_bts_untrace(struct task_struct *child)  {  	if (unlikely(child->bts)) { -		ds_release_bts(child->bts); +		free_bts_context(child->bts);  		child->bts = NULL; - -		/* We cannot update total_vm and locked_vm since -		   child's mm is already gone. But we can reclaim the -		   memory. */ -		kfree(child->bts_buffer); -		child->bts_buffer = NULL; -		child->bts_size = 0;  	}  } - -static void ptrace_bts_detach(struct task_struct *child) -{ -	/* -	 * Ptrace_detach() races with ptrace_untrace() in case -	 * the child dies and is reaped by another thread. -	 * -	 * We only do the memory accounting at this point and -	 * leave the buffer deallocation and the bts tracer -	 * release to ptrace_bts_untrace() which will be called -	 * later on with tasklist_lock held. -	 */ -	release_locked_buffer(child->bts_buffer, child->bts_size); -} -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {}  #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ -	ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ -	ptrace_bts_untrace(child); -} -  /*   * Called by kernel/ptrace.c when detaching..   
* @@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)  #ifdef TIF_SYSCALL_EMU  	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);  #endif -	ptrace_bts_detach(child);  }  #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7563b31b4f0..af71d06624b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,5 +491,42 @@ void force_hpet_resume(void)  		break;  	}  } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ +	struct pci_dev *nb_ht; +	unsigned int devfn; +	u32 val; + +	devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); +	nb_ht = pci_get_slot(dev->bus, devfn); +	if (!nb_ht) +		return; + +	pci_read_config_dword(nb_ht, 0x60, &val); +	set_dev_node(&dev->dev, val & 7); +	pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, +			quirk_amd_nb_node);  #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a..d2d1ce8170f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),  		},  	}, +	{   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ +		.callback = set_bios_reboot, +		.ident = "Dell OptiPlex 360", +		.matches = { +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), +			DMI_MATCH(DMI_BOARD_NAME, "0T656F"), +		}, +	},  	{	/* Handle problems with rebooting on Dell 2400's */  		.callback = set_bios_reboot,  		.ident = "Dell PowerEdge 2400", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..be5ae80f897 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@  #define ARCH_SETUP  #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. 
+ */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; +  RESERVE_BRK(dmi_alloc, 65536);  unsigned int boot_cpu_id __read_mostly; @@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;  unsigned long mmu_cr4_features = X86_CR4_PAE;  #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version;  /*   * Setup options @@ -293,15 +301,13 @@ static void __init reserve_brk(void)  #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 -  #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)  static void __init relocate_initrd(void)  {  	u64 ramdisk_image = boot_params.hdr.ramdisk_image;  	u64 ramdisk_size  = boot_params.hdr.ramdisk_size; -	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; +	u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;  	u64 ramdisk_here;  	unsigned long slop, clen, mapaddr;  	char *p, *q; @@ -357,14 +363,13 @@ static void __init relocate_initrd(void)  		ramdisk_image, ramdisk_image + ramdisk_size - 1,  		ramdisk_here, ramdisk_here + ramdisk_size - 1);  } -#endif  static void __init reserve_initrd(void)  {  	u64 ramdisk_image = boot_params.hdr.ramdisk_image;  	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;  	u64 ramdisk_end   = ramdisk_image + ramdisk_size; -	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; +	u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;  	if (!boot_params.hdr.type_of_loader ||  	    !ramdisk_image || !ramdisk_size) @@ -394,14 +399,8 @@ static void __init reserve_initrd(void)  		return;  	} -#ifdef CONFIG_X86_32  	relocate_initrd(); -#else -	printk(KERN_ERR "initrd extends beyond end of memory " -	       "(0x%08llx > 0x%08llx)\ndisabling initrd\n", -	       ramdisk_end, end_of_lowmem); -	initrd_start = 0; -#endif +  	free_early(ramdisk_image, ramdisk_end);  }  #else @@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p)  #endif  	saved_video_mode = boot_params.hdr.vid_mode;  	bootloader_type = boot_params.hdr.type_of_loader; +	if ((bootloader_type >> 4) == 0xe) { +		bootloader_type &= 0xf; +		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; +	} +	bootloader_version  = bootloader_type & 0xf; +	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;  #ifdef CONFIG_BLK_DEV_RAM  	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; @@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p)  		max_low_pfn = max_pfn;  	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;  #endif  #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION  	setup_bios_corruption_check();  #endif +	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", +			max_pfn_mapped<<PAGE_SHIFT); +  	reserve_brk();  	/* max_pfn_mapped is updated here */ @@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p)  #ifdef CONFIG_X86_32  /** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - *	Perform any necessary interrupt initialisation prior to setting up - *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA - *	interrupts should be initialised here if the machine emulates a PC - *	in any way. 
- **/ -void __init x86_quirk_pre_intr_init(void) -{ -	if (x86_quirks->arch_pre_intr_init) { -		if (x86_quirks->arch_pre_intr_init()) -			return; -	} -	init_ISA_irqs(); -} - -/**   * x86_quirk_intr_init - post gate setup interrupt initialisation   *   * Description: diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8f0e13be36b..9c3f0823e6a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)  	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;  #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +	/* +	 * make sure boot cpu node_number is right, when boot cpu is on the +	 * node that doesn't have mem installed +	 */ +	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif +  	/* Setup node to cpumask map */  	setup_node_to_cpumask_map(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e..4c578751e94 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes   *  2000-2002   x86-64 support by Andi Kleen   */ -  #include <linux/sched.h>  #include <linux/mm.h>  #include <linux/smp.h> @@ -25,11 +24,11 @@  #include <asm/ucontext.h>  #include <asm/i387.h>  #include <asm/vdso.h> +#include <asm/mce.h>  #ifdef CONFIG_X86_64  #include <asm/proto.h>  #include <asm/ia32_unistd.h> -#include <asm/mce.h>  #endif /* CONFIG_X86_64 */  #include <asm/syscall.h> @@ -857,10 +856,10 @@ static void do_signal(struct pt_regs *regs)  void  do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)  { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#ifdef CONFIG_X86_NEW_MCE  	/* notify userspace of pending MCEs */  	if (thread_info_flags & _TIF_MCE_NOTIFY) -		mce_notify_user(); +		mce_notify_process();  #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */  	/* deal with pending signal delivery */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8cca..ec1de97600e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)   * this function calls the 'stop' function on all other CPUs in the system.   */ +asmlinkage void smp_reboot_interrupt(void) +{ +	ack_APIC_irq(); +	irq_enter(); +	stop_this_cpu(NULL); +	irq_exit(); +} +  static void native_smp_send_stop(void)  {  	unsigned long flags; +	unsigned long wait;  	if (reboot_force)  		return; -	smp_call_function(stop_this_cpu, NULL, 0); +	/* +	 * Use an own vector here because smp_call_function +	 * does lots of things not suitable in a panic situation. 
+	 * On most systems we could also use an NMI here, +	 * but there are a few systems around where NMI +	 * is problematic so stay with an non NMI for now +	 * (this implies we cannot stop CPUs spinning with irq off +	 * currently) +	 */ +	if (num_online_cpus() > 1) { +		apic->send_IPI_allbutself(REBOOT_VECTOR); + +		/* Don't wait longer than a second */ +		wait = USEC_PER_SEC; +		while (num_online_cpus() > 1 && wait--) +			udelay(1); +	} +  	local_irq_save(flags);  	disable_local_APIC();  	local_irq_restore(flags); @@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)  {  	ack_APIC_irq();  	inc_irq_stat(irq_resched_count); +	/* +	 * KVM uses this interrupt to force a cpu out of guest mode +	 */  }  void smp_call_function_interrupt(struct pt_regs *regs) @@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)  }  struct smp_ops smp_ops = { -	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, -	.smp_prepare_cpus = native_smp_prepare_cpus, -	.smp_cpus_done = native_smp_cpus_done, +	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu, +	.smp_prepare_cpus	= native_smp_prepare_cpus, +	.smp_cpus_done		= native_smp_cpus_done, -	.smp_send_stop = native_smp_send_stop, -	.smp_send_reschedule = native_smp_send_reschedule, +	.smp_send_stop		= native_smp_send_stop, +	.smp_send_reschedule	= native_smp_send_reschedule, -	.cpu_up = native_cpu_up, -	.cpu_die = native_cpu_die, -	.cpu_disable = native_cpu_disable, -	.play_dead = native_play_dead, +	.cpu_up			= native_cpu_up, +	.cpu_die		= native_cpu_die, +	.cpu_disable		= native_cpu_disable, +	.play_dead		= native_play_dead, -	.send_call_func_ipi = native_send_call_func_ipi, +	.send_call_func_ipi	= native_send_call_func_ipi,  	.send_call_func_single_ipi = native_send_call_func_single_ipi,  };  EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d..2fecda69ee6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this   * won't ... remember to clear down the APIC, etc later.   */ -int __devinit +int __cpuinit  wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)  {  	unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)  	return (send_status | accept_status);  } -int __devinit +static int __cpuinit  wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  {  	unsigned long send_status, accept_status = 0; @@ -822,10 +822,12 @@ do_rest:  	/* mark "stuck" area as not stuck */  	*((volatile unsigned long *)trampoline_base) = 0; -	/* -	 * Cleanup possible dangling ends... -	 */ -	smpboot_restore_warm_reset_vector(); +	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { +		/* +		 * Cleanup possible dangling ends... +		 */ +		smpboot_restore_warm_reset_vector(); +	}  	return boot_error;  } @@ -871,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)  	err = do_boot_cpu(apicid, cpu); -	zap_low_mappings(); +	zap_low_mappings(false);  	low_mappings = 0;  #else  	err = do_boot_cpu(apicid, cpu); @@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)  	 */  	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&  	    !cpu_has_apic) { -		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", -			boot_cpu_physical_apicid); -		printk(KERN_ERR "... 
forcing use of dummy APIC emulation." +		if (!disable_apic) { +			pr_err("BIOS bug, local APIC #%d not detected!...\n", +				boot_cpu_physical_apicid); +			pr_err("... forcing use of dummy APIC emulation."  				"(tell your hw vendor)\n"); +		}  		smpboot_clear_io_apic();  		arch_disable_smp_support();  		return -1; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d..c3eb207181f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)  static int save_stack_stack(void *data, char *name)  { -	return -1; +	return 0;  }  static void save_stack_address(void *data, unsigned long addr, int reliable) @@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)  }  EXPORT_SYMBOL_GPL(save_stack_trace); +void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +{ +	dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); +	if (trace->nr_entries < trace->max_entries) +		trace->entries[trace->nr_entries++] = ULONG_MAX; +} +  void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)  {  	dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b49..d51321ddafd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,5 @@ ENTRY(sys_call_table)  	.long sys_inotify_init1  	.long sys_preadv  	.long sys_pwritev +	.long sys_rt_tgsigqueueinfo	/* 335 */ +	.long sys_perf_counter_open diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8c7b03b0cfc..124d40c575d 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)  	struct bau_desc *adp;  	struct bau_desc *ad2; -	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); +	/* +	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) +	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade +	 */ +	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* +		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);  	BUG_ON(!adp);  	pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)  				      (n << UV_DESC_BASE_PNODE_SHIFT | m));  	} -	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { +	/* +	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each +	 * cpu even though we only use the first one; one descriptor can +	 * describe a broadcast to 256 nodes. +	 */ +	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); +		i++, ad2++) {  		memset(ad2, 0, sizeof(struct bau_desc));  		ad2->header.sw_ack_flag = 1;  		/* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..a0f48f5671c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -45,6 +45,7 @@  #include <linux/edac.h>  #endif +#include <asm/kmemcheck.h>  #include <asm/stacktrace.h>  #include <asm/processor.h>  #include <asm/debugreg.h> @@ -53,6 +54,7 @@  #include <asm/traps.h>  #include <asm/desc.h>  #include <asm/i387.h> +#include <asm/mce.h>  #include <asm/mach_traps.h> @@ -64,8 +66,6 @@  #include <asm/setup.h>  #include <asm/traps.h> -#include "cpu/mcheck/mce.h" -  asmlinkage int system_call(void);  /* Do we ignore FPU interrupts ? 
*/ @@ -534,6 +534,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	get_debugreg(condition, 6); +	/* Catch kmemcheck conditions first of all! */ +	if (condition & DR_STEP && kmemcheck_trap(regs)) +		return; +  	/*  	 * The processor cleared BTF, so don't mark that we need it set.  	 */ @@ -798,15 +802,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)  	return new_kesp;  } -#else +#endif +  asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)  {  } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)  {  } -#endif  /*   * 'math_state_restore()' saves the current math information in the @@ -839,9 +843,6 @@ asmlinkage void math_state_restore(void)  	}  	clts();				/* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 -	restore_fpu(tsk); -#else  	/*  	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.  	 */ @@ -850,7 +851,7 @@ asmlinkage void math_state_restore(void)  		force_sig(SIGSEGV, tsk);  		return;  	} -#endif +  	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */  	tsk->fpu_counter++;  } @@ -945,8 +946,13 @@ void __init trap_init(void)  #endif  	set_intr_gate(19, &simd_coprocessor_error); +	/* Reserve all the builtin and the syscall vector: */ +	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) +		set_bit(i, used_vectors); +  #ifdef CONFIG_IA32_EMULATION  	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +	set_bit(IA32_SYSCALL_VECTOR, used_vectors);  #endif  #ifdef CONFIG_X86_32 @@ -963,17 +969,9 @@ void __init trap_init(void)  	}  	set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - -	/* Reserve all the builtin and the syscall vector: */ -	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) -		set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 -	set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else  	set_bit(SYSCALL_VECTOR, used_vectors);  #endif +  	/*  	 * Should be a barrier for any external CPU state:  	 */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc43..6e1a368d21d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -9,6 +9,7 @@  #include <linux/delay.h>  #include <linux/clocksource.h>  #include <linux/percpu.h> +#include <linux/timex.h>  #include <asm/hpet.h>  #include <asm/timer.h> @@ -384,13 +385,13 @@ unsigned long native_calibrate_tsc(void)  {  	u64 tsc1, tsc2, delta, ref1, ref2;  	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; -	unsigned long flags, latch, ms, fast_calibrate, tsc_khz; +	unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;  	int hpet = is_hpet_enabled(), i, loopmin; -	tsc_khz = get_hypervisor_tsc_freq(); -	if (tsc_khz) { +	hv_tsc_khz = get_hypervisor_tsc_freq(); +	if (hv_tsc_khz) {  		printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); -		return tsc_khz; +		return hv_tsc_khz;  	}  	local_irq_save(flags); @@ -589,22 +590,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);   */  DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);  static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)  { -	unsigned long long tsc_now, ns_now; +	unsigned long long tsc_now, ns_now, *offset;  	unsigned long flags, *scale;  	local_irq_save(flags);  	sched_clock_idle_sleep_event();  	scale = &per_cpu(cyc2ns, cpu); +	offset = &per_cpu(cyc2ns_offset, cpu);  	rdtscll(tsc_now);  	ns_now = __cycles_2_ns(tsc_now); -	if (cpu_khz) +	if (cpu_khz) {  		*scale = (NSEC_PER_MSEC << 
CYC2NS_SCALE_FACTOR)/cpu_khz; +		*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); +	}  	sched_clock_idle_wakeup_event(0);  	local_irq_restore(flags); @@ -631,17 +636,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,  				void *data)  {  	struct cpufreq_freqs *freq = data; -	unsigned long *lpj, dummy; +	unsigned long *lpj;  	if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))  		return 0; -	lpj = &dummy; -	if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +	lpj = &boot_cpu_data.loops_per_jiffy;  #ifdef CONFIG_SMP +	if (!(freq->flags & CPUFREQ_CONST_LOOPS))  		lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else -	lpj = &boot_cpu_data.loops_per_jiffy;  #endif  	if (!ref_freq) { @@ -710,7 +713,16 @@ static cycle_t read_tsc(struct clocksource *cs)  #ifdef CONFIG_X86_64  static cycle_t __vsyscall_fn vread_tsc(void)  { -	cycle_t ret = (cycle_t)vget_cycles(); +	cycle_t ret; + +	/* +	 * Surround the RDTSC by barriers, to make sure it's not +	 * speculated to outside the seqlock critical section and +	 * does not cause time warps: +	 */ +	rdtsc_barrier(); +	ret = (cycle_t)vget_cycles(); +	rdtsc_barrier();  	return ret >= __vsyscall_gtod_data.clock.cycle_last ?  		ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef..027b5b49899 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;   * of a critical section, to be able to prove TSC time-warps:   */  static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +  static __cpuinitdata cycles_t last_tsc;  static __cpuinitdata cycles_t max_warp;  static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)  		return;  	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { -		printk(KERN_INFO -		       "Skipping synchronization checks as TSC is reliable.\n"); +		pr_info("Skipping synchronization checks as TSC is reliable.\n");  		return;  	} -	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", -			  smp_processor_id(), cpu); +	pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", +		smp_processor_id(), cpu);  	/*  	 * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)  	if (nr_warps) {  		printk("\n"); -		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," -				    " turning off TSC clock.\n", max_warp); +		pr_warning("Measured %Ld cycles TSC warp between CPUs, " +			   "turning off TSC clock.\n", max_warp);  		mark_tsc_unstable("check_tsc_sync_source failed");  	} else {  		printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)  	while (atomic_read(&stop_count) != cpus)  		cpu_relax();  } -#undef NR_LOOPS - diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..9c4e6253905 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk  	info->regs.pt.ds = 0;  	info->regs.pt.es = 0;  	info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. 
- */ +#ifndef CONFIG_X86_32_LAZY_GS +	info->regs.pt.gs = 0; +#endif  /*   * The flags register is also special: we cannot trust that the user @@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk  	}  /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)   */ -	info->regs32->ax = 0; +	info->regs32->ax = VM86_SIGNAL;  	tsk->thread.saved_sp0 = tsk->thread.sp0;  	tsk->thread.saved_fs = info->regs32->fs;  	tsk->thread.saved_gs = get_user_gs(info->regs32); @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk  	__asm__ __volatile__(  		"movl %0,%%esp\n\t"  		"movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS  		"mov  %2, %%gs\n\t" +#endif  		"jmp resume_userspace"  		: /* no outputs */  		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95deb9f2211..b263423fbe2 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,  }  #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev)  { -	paravirt_enter_lazy_cpu(); +	paravirt_start_context_switch(prev);  	vmi_ops.set_lazy_mode(2);  } +static void vmi_end_context_switch(struct task_struct *next) +{ +	vmi_ops.set_lazy_mode(0); +	paravirt_end_context_switch(next); +} +  static void vmi_enter_lazy_mmu(void)  {  	paravirt_enter_lazy_mmu();  	vmi_ops.set_lazy_mode(1);  } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void)  { -	paravirt_leave_lazy(paravirt_get_lazy_mode());  	vmi_ops.set_lazy_mode(0); +	paravirt_leave_lazy_mmu();  }  static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)  	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);  	para_fill(pv_cpu_ops.io_delay, IODelay); -	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, +	para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,  		  set_lazy_mode, SetLazyMode); -	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, +	para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,  		  set_lazy_mode, SetLazyMode);  	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,  		  set_lazy_mode, SetLazyMode); -	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, +	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,  		  set_lazy_mode, SetLazyMode);  	/* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f01..367e8788204 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,433 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Modernisation, unification and other changes and fixes: + *   Copyright (C) 2007-2009  Sam Ravnborg <sam@ravnborg.org> + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. 
+ */ +  #ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" +#define LOAD_OFFSET __PAGE_OFFSET  #else -# include "vmlinux_64.lds.S" +#define LOAD_OFFSET __START_KERNEL_map  #endif + +#include <asm-generic/vmlinux.lds.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page_types.h> +#include <asm/cache.h> +#include <asm/boot.h> + +#undef i386     /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + +PHDRS { +	text PT_LOAD FLAGS(5);          /* R_E */ +	data PT_LOAD FLAGS(7);          /* RWE */ +#ifdef CONFIG_X86_64 +	user PT_LOAD FLAGS(7);          /* RWE */ +	data.init PT_LOAD FLAGS(7);     /* RWE */ +#ifdef CONFIG_SMP +	percpu PT_LOAD FLAGS(7);        /* RWE */ +#endif +	data.init2 PT_LOAD FLAGS(7);    /* RWE */ +#endif +	note PT_NOTE FLAGS(0);          /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 +        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; +        phys_startup_32 = startup_32 - LOAD_OFFSET; +#else +        . = __START_KERNEL; +        phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + +	/* Text and read-only data */ + +	/* bootstrapping code */ +	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { +		_text = .; +		*(.text.head) +	} :text = 0x9090 + +	/* The rest of the text */ +	.text :  AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 +		/* not really needed, already page aligned */ +		. = ALIGN(PAGE_SIZE); +		*(.text.page_aligned) +#endif +		. = ALIGN(8); +		_stext = .; +		TEXT_TEXT +		SCHED_TEXT +		LOCK_TEXT +		KPROBES_TEXT +		IRQENTRY_TEXT +		*(.fixup) +		*(.gnu.warning) +		/* End of text section */ +		_etext = .; +	} :text = 0x9090 + +	NOTES :text :note + +	/* Exception table */ +	. = ALIGN(16); +	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { +		__start___ex_table = .; +		*(__ex_table) +		__stop___ex_table = .; +	} :text = 0x9090 + +	RODATA + +	/* Data */ +	. = ALIGN(PAGE_SIZE); +	.data : AT(ADDR(.data) - LOAD_OFFSET) { +		/* Start of data section */ +		_sdata = .; +		DATA_DATA +		CONSTRUCTORS + +#ifdef CONFIG_X86_64 +		/* End of data section */ +		_edata = .; +#endif +	} :data + +#ifdef CONFIG_X86_32 +	/* 32 bit has nosave before _edata */ +	. = ALIGN(PAGE_SIZE); +	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { +		__nosave_begin = .; +		*(.data.nosave) +		. = ALIGN(PAGE_SIZE); +		__nosave_end = .; +	} +#endif + +	. = ALIGN(PAGE_SIZE); +	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { +		*(.data.page_aligned) +		*(.data.idt) +	} + +#ifdef CONFIG_X86_32 +	. = ALIGN(32); +#else +	. = ALIGN(PAGE_SIZE); +	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif +	.data.cacheline_aligned : +		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { +		*(.data.cacheline_aligned) +	} + +	/* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 +	. = ALIGN(32); +#else +	. 
= ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif +	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { +		*(.data.read_mostly) + +#ifdef CONFIG_X86_32 +		/* End of data section */ +		_edata = .; +#endif +	} + +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ +                            SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ +                            SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + +	. = VSYSCALL_ADDR; +	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { +		*(.vsyscall_0) +	} :user + +	__vsyscall_0 = VSYSCALL_VIRT_ADDR; + +	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { +		*(.vsyscall_fn) +	} + +	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { +		*(.vsyscall_gtod_data) +	} + +	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); +	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { +		*(.vsyscall_clock) +	} +	vsyscall_clock = VVIRT(.vsyscall_clock); + + +	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { +		*(.vsyscall_1) +	} +	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { +		*(.vsyscall_2) +	} + +	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { +		*(.vgetcpu_mode) +	} +	vgetcpu_mode = VVIRT(.vgetcpu_mode); + +	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +	.jiffies : AT(VLOAD(.jiffies)) { +		*(.jiffies) +	} +	jiffies = VVIRT(.jiffies); + +	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { +		*(.vsyscall_3) +	} + +	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ + +	/* init_task */ +	. = ALIGN(THREAD_SIZE); +	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { +		*(.data.init_task) +	} +#ifdef CONFIG_X86_64 +	 :data.init +#endif + +	/* +	 * smp_locks might be freed after init +	 * start/end must be page aligned +	 */ +	. = ALIGN(PAGE_SIZE); +	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { +		__smp_locks = .; +		*(.smp_locks) +		__smp_locks_end = .; +		. = ALIGN(PAGE_SIZE); +	} + +	/* Init code and data - will be freed after init */ +	. = ALIGN(PAGE_SIZE); +	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { +		__init_begin = .; /* paired with __init_end */ +		_sinittext = .; +		INIT_TEXT +		_einittext = .; +	} + +	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { +		INIT_DATA +	} + +	. = ALIGN(16); +	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { +		__setup_start = .; +		*(.init.setup) +		__setup_end = .; +	} +	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { +		__initcall_start = .; +		INITCALLS +		__initcall_end = .; +	} + +	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { +		__con_initcall_start = .; +		*(.con_initcall.init) +		__con_initcall_end = .; +	} + +	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { +		__x86_cpu_dev_start = .; +		*(.x86_cpu_dev.init) +		__x86_cpu_dev_end = .; +	} + +	SECURITY_INIT + +	. = ALIGN(8); +	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { +		__parainstructions = .; +		*(.parainstructions) +		__parainstructions_end = .; +	} + +	. 
= ALIGN(8); +	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { +		__alt_instructions = .; +		*(.altinstructions) +		__alt_instructions_end = .; +	} + +	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { +		*(.altinstr_replacement) +	} + +	/* +	 * .exit.text is discard at runtime, not link time, to deal with +	 *  references from .altinstructions and .eh_frame +	 */ +	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { +		EXIT_TEXT +	} + +	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { +		EXIT_DATA +	} + +#ifdef CONFIG_BLK_DEV_INITRD +	. = ALIGN(PAGE_SIZE); +	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { +		__initramfs_start = .; +		*(.init.ramfs) +		__initramfs_end = .; +	} +#endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) +	/* +	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the +	 * output PHDR, so the next output section - __data_nosave - should +	 * start another section data.init2.  Also, pda should be at the head of +	 * percpu area.  Preallocate it and define the percpu offset symbol +	 * so that it can be accessed as a percpu variable. +	 */ +	. = ALIGN(PAGE_SIZE); +	PERCPU_VADDR(0, :percpu) +#else +	PERCPU(PAGE_SIZE) +#endif + +	. = ALIGN(PAGE_SIZE); + +	/* freed after init ends here */ +	.init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { +		__init_end = .; +	} + +#ifdef CONFIG_X86_64 +	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { +		. = ALIGN(PAGE_SIZE); +		__nosave_begin = .; +		*(.data.nosave) +		. = ALIGN(PAGE_SIZE); +		__nosave_end = .; +	} :data.init2 +	/* use another section data.init2, see PERCPU_VADDR() above */ +#endif + +	/* BSS */ +	. = ALIGN(PAGE_SIZE); +	.bss : AT(ADDR(.bss) - LOAD_OFFSET) { +		__bss_start = .; +		*(.bss.page_aligned) +		*(.bss) +		. = ALIGN(4); +		__bss_stop = .; +	} + +	. = ALIGN(PAGE_SIZE); +	.brk : AT(ADDR(.brk) - LOAD_OFFSET) { +		__brk_base = .; +		. += 64 * 1024;		/* 64k alignment slop space */ +		*(.brk_reservation)	/* areas brk users have reserved */ +		__brk_limit = .; +	} + +	.end : AT(ADDR(.end) - LOAD_OFFSET) { +		_end = .; +	} + +	/* Sections to be discarded */ +	/DISCARD/ : { +		*(.exitcall.exit) +		*(.eh_frame) +		*(.discard) +	} + +        STABS_DEBUG +        DWARF_DEBUG +} + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), +        "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. 
+ */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), +	"kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), +        "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, +       "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index 62ad500d55f..00000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,229 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include <asm-generic/vmlinux.lds.h> -#include <asm/thread_info.h> -#include <asm/page_types.h> -#include <asm/cache.h> -#include <asm/boot.h> - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - -PHDRS { -	text PT_LOAD FLAGS(5);	/* R_E */ -	data PT_LOAD FLAGS(7);	/* RWE */ -	note PT_NOTE FLAGS(0);	/* ___ */ -} -SECTIONS -{ -  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; -  phys_startup_32 = startup_32 - LOAD_OFFSET; - -  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { -  	_text = .;			/* Text and read-only data */ -	*(.text.head) -  } :text = 0x9090 - -  /* read-only */ -  .text : AT(ADDR(.text) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ -	*(.text.page_aligned) -	TEXT_TEXT -	SCHED_TEXT -	LOCK_TEXT -	KPROBES_TEXT -	IRQENTRY_TEXT -	*(.fixup) -	*(.gnu.warning) -  	_etext = .;			/* End of text section */ -  } :text = 0x9090 - -  NOTES :text :note - -  . = ALIGN(16);		/* Exception table */ -  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { -  	__start___ex_table = .; -	 *(__ex_table) -  	__stop___ex_table = .; -  } :text = 0x9090 - -  RODATA - -  /* writeable */ -  . = ALIGN(PAGE_SIZE); -  .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */ -	DATA_DATA -	CONSTRUCTORS -	} :data - -  . = ALIGN(PAGE_SIZE); -  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { -  	__nosave_begin = .; -	*(.data.nosave) -  	. = ALIGN(PAGE_SIZE); -  	__nosave_end = .; -  } - -  . = ALIGN(PAGE_SIZE); -  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { -	*(.data.page_aligned) -	*(.data.idt) -  } - -  . = ALIGN(32); -  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { -	*(.data.cacheline_aligned) -  } - -  /* rarely changed data like cpu maps */ -  . = ALIGN(32); -  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { -	*(.data.read_mostly) -	_edata = .;		/* End of data section */ -  } - -  . = ALIGN(THREAD_SIZE);	/* init_task */ -  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { -	*(.data.init_task) -  } - -  /* might get freed after init */ -  . 
= ALIGN(PAGE_SIZE); -  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { -  	__smp_locks = .; -	*(.smp_locks) -	__smp_locks_end = .; -  } -  /* will be freed after init -   * Following ALIGN() is required to make sure no other data falls on the -   * same page where __smp_alt_end is pointing as that page might be freed -   * after boot. Always make sure that ALIGN() directive is present after -   * the section which contains __smp_alt_end. -   */ -  . = ALIGN(PAGE_SIZE); - -  /* will be freed after init */ -  . = ALIGN(PAGE_SIZE);		/* Init code and data */ -  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { -  	__init_begin = .; -	_sinittext = .; -	INIT_TEXT -	_einittext = .; -  } -  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { -	INIT_DATA -  } -  . = ALIGN(16); -  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { -  	__setup_start = .; -	*(.init.setup) -  	__setup_end = .; -   } -  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { -  	__initcall_start = .; -	INITCALLS -  	__initcall_end = .; -  } -  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { -  	__con_initcall_start = .; -	*(.con_initcall.init) -  	__con_initcall_end = .; -  } -  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -	__x86_cpu_dev_start = .; -	*(.x86_cpu_dev.init) -	__x86_cpu_dev_end = .; -  } -  SECURITY_INIT -  . = ALIGN(4); -  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { -  	__alt_instructions = .; -	*(.altinstructions) -	__alt_instructions_end = .; -  } -  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { -	*(.altinstr_replacement) -  } -  . = ALIGN(4); -  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { -  	__parainstructions = .; -	*(.parainstructions) -  	__parainstructions_end = .; -  } -  /* .exit.text is discard at runtime, not link time, to deal with references -     from .altinstructions and .eh_frame */ -  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { -	EXIT_TEXT -  } -  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { -	EXIT_DATA -  } -#if defined(CONFIG_BLK_DEV_INITRD) -  . = ALIGN(PAGE_SIZE); -  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { -	__initramfs_start = .; -	*(.init.ramfs) -	__initramfs_end = .; -  } -#endif -  PERCPU(PAGE_SIZE) -  . = ALIGN(PAGE_SIZE); -  /* freed after init ends here */ - -  .bss : AT(ADDR(.bss) - LOAD_OFFSET) { -	__init_end = .; -	__bss_start = .;		/* BSS */ -	*(.bss.page_aligned) -	*(.bss) -	. = ALIGN(4); -	__bss_stop = .; -  } - -  .brk : AT(ADDR(.brk) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	__brk_base = . ; - 	. += 64 * 1024 ;	/* 64k alignment slop space */ -	*(.brk_reservation)	/* areas brk users have reserved */ -	__brk_limit = . ; -  } - -  .end : AT(ADDR(.end) - LOAD_OFFSET) { -	_end = . 
; -  } - -  /* Sections to be discarded */ -  /DISCARD/ : { -	*(.exitcall.exit) -	*(.discard) -	} - -  STABS_DEBUG - -  DWARF_DEBUG -} - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), -	"kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, -       "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index c8742507b03..00000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,298 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/asm-offsets.h> -#include <asm/page_types.h> - -#undef i386	/* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -PHDRS { -	text PT_LOAD FLAGS(5);	/* R_E */ -	data PT_LOAD FLAGS(7);	/* RWE */ -	user PT_LOAD FLAGS(7);	/* RWE */ -	data.init PT_LOAD FLAGS(7);	/* RWE */ -#ifdef CONFIG_SMP -	percpu PT_LOAD FLAGS(7);	/* RWE */ -#endif -	data.init2 PT_LOAD FLAGS(7);	/* RWE */ -	note PT_NOTE FLAGS(0);	/* ___ */ -} -SECTIONS -{ -  . = __START_KERNEL; -  phys_startup_64 = startup_64 - LOAD_OFFSET; -  .text :  AT(ADDR(.text) - LOAD_OFFSET) { -	_text = .;			/* Text and read-only data */ -	/* First the code that has to be first for bootstrapping */ -	*(.text.head) -	_stext = .; -	/* Then the rest */ -	TEXT_TEXT -	SCHED_TEXT -	LOCK_TEXT -	KPROBES_TEXT -	IRQENTRY_TEXT -	*(.fixup) -	*(.gnu.warning) -	_etext = .;		/* End of text section */ -  } :text = 0x9090 - -  NOTES :text :note - -  . = ALIGN(16);		/* Exception table */ -  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { -  	__start___ex_table = .; -	 *(__ex_table) -  	__stop___ex_table = .; -  } :text = 0x9090 - -  RODATA - -  . = ALIGN(PAGE_SIZE);		/* Align data segment to page size boundary */ -				/* Data */ -  .data : AT(ADDR(.data) - LOAD_OFFSET) { -	DATA_DATA -	CONSTRUCTORS -	_edata = .;			/* End of data section */ -	} :data - - -  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -	*(.data.cacheline_aligned) -  } -  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); -  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { -  	*(.data.read_mostly) -  } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - -  . = VSYSCALL_ADDR; -  .vsyscall_0 :	 AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user -  __vsyscall_0 = VSYSCALL_VIRT_ADDR; - -  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } -  . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); -  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) -		{ *(.vsyscall_gtod_data) } -  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); -  .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) -		{ *(.vsyscall_clock) } -  vsyscall_clock = VVIRT(.vsyscall_clock); - - -  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) -		{ *(.vsyscall_1) } -  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) -		{ *(.vsyscall_2) } - -  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } -  vgetcpu_mode = VVIRT(.vgetcpu_mode); - -  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } -  jiffies = VVIRT(.jiffies); - -  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) -		{ *(.vsyscall_3) } - -  . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - -  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { -	. = ALIGN(THREAD_SIZE);	/* init_task */ -	*(.data.init_task) -  }:data.init - -  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	*(.data.page_aligned) -  } - -  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { -	/* might get freed after init */ -	. = ALIGN(PAGE_SIZE); -	__smp_alt_begin = .; -	__smp_locks = .; -	*(.smp_locks) -	__smp_locks_end = .; -	. = ALIGN(PAGE_SIZE); -	__smp_alt_end = .; -  } - -  . = ALIGN(PAGE_SIZE);		/* Init code and data */ -  __init_begin = .;	/* paired with __init_end */ -  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { -	_sinittext = .; -	INIT_TEXT -	_einittext = .; -  } -  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { -	__initdata_begin = .; -	INIT_DATA -	__initdata_end = .; -   } - -  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { -	. = ALIGN(16); -	__setup_start = .; -	*(.init.setup) -	__setup_end = .; -  } -  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { -	__initcall_start = .; -	INITCALLS -	__initcall_end = .; -  } -  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { -	__con_initcall_start = .; -	*(.con_initcall.init) -	__con_initcall_end = .; -  } -  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -	__x86_cpu_dev_start = .; -	*(.x86_cpu_dev.init) -	__x86_cpu_dev_end = .; -  } -  SECURITY_INIT - -  . = ALIGN(8); -  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { -	__parainstructions = .; -       *(.parainstructions) -	__parainstructions_end = .; -  } - -  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { -	. = ALIGN(8); -	__alt_instructions = .; -	*(.altinstructions) -	__alt_instructions_end = .; -  } -  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { -	*(.altinstr_replacement) -  } -  /* .exit.text is discard at runtime, not link time, to deal with references -     from .altinstructions and .eh_frame */ -  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { -	EXIT_TEXT -  } -  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { -	EXIT_DATA -  } - -#ifdef CONFIG_BLK_DEV_INITRD -  . = ALIGN(PAGE_SIZE); -  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { -	__initramfs_start = .; -	*(.init.ramfs) -	__initramfs_end = .; -  } -#endif - -#ifdef CONFIG_SMP -  /* -   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the -   * output PHDR, so the next output section - __data_nosave - should -   * start another section data.init2.  
Also, pda should be at the head of -   * percpu area.  Preallocate it and define the percpu offset symbol -   * so that it can be accessed as a percpu variable. -   */ -  . = ALIGN(PAGE_SIZE); -  PERCPU_VADDR(0, :percpu) -#else -  PERCPU(PAGE_SIZE) -#endif - -  . = ALIGN(PAGE_SIZE); -  __init_end = .; - -  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	__nosave_begin = .; -	*(.data.nosave) -	. = ALIGN(PAGE_SIZE); -	__nosave_end = .; -  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - -  .bss : AT(ADDR(.bss) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	__bss_start = .;		/* BSS */ -	*(.bss.page_aligned) -	*(.bss) -	__bss_stop = .; -  } - -  .brk : AT(ADDR(.brk) - LOAD_OFFSET) { -	. = ALIGN(PAGE_SIZE); -	__brk_base = . ; - 	. += 64 * 1024 ;	/* 64k alignment slop space */ -	*(.brk_reservation)	/* areas brk users have reserved */ -	__brk_limit = . ; -  } - -  _end = . ; - -  /* Sections to be discarded */ -  /DISCARD/ : { -	*(.exitcall.exit) -	*(.eh_frame) -	*(.discard) -	} - -  STABS_DEBUG - -  DWARF_DEBUG -} - - /* -  * Per-cpu symbols which need to be offset from __per_cpu_load -  * for the boot processor. -  */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), -	"kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), -        "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, -       "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc906..25ee06a80aa 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)  			return;  		} -		/* -		 * Surround the RDTSC by barriers, to make sure it's not -		 * speculated to outside the seqlock critical section and -		 * does not cause time warps: -		 */ -		rdtsc_barrier();  		now = vread(); -		rdtsc_barrier(); -  		base = __vsyscall_gtod_data.clock.cycle_last;  		mask = __vsyscall_gtod_data.clock.mask;  		mult = __vsyscall_gtod_data.clock.mult;  |
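
The set_cyc2ns_scale() hunk in arch/x86/kernel/tsc.c above adds a per-CPU cyc2ns_offset so that the cycles-to-nanoseconds conversion stays continuous when cpu_khz (and therefore the scale factor) changes. Below is a minimal userspace sketch of that fixed-point arithmetic, not code from the tree: SCALE_FACTOR, the variable names and main() are illustrative stand-ins for CYC2NS_SCALE_FACTOR and the kernel's per-CPU variables.

/* Sketch only: mirrors the scale/offset math from the tsc.c hunk above. */
#include <stdio.h>
#include <stdint.h>

#define SCALE_FACTOR	10		/* assumed value of CYC2NS_SCALE_FACTOR */
#define NSEC_PER_MSEC	1000000ULL

static uint64_t scale, offset;		/* per-CPU variables in the kernel */

/* ns = (tsc * scale >> SCALE_FACTOR) + offset */
static uint64_t cycles_to_ns(uint64_t tsc)
{
	return ((tsc * scale) >> SCALE_FACTOR) + offset;
}

/*
 * Mirror of set_cyc2ns_scale(): recompute the scale for the new frequency,
 * then pick the offset so the current TSC value still maps to the current
 * nanosecond value (no jump in the derived clock).  The offset may wrap;
 * unsigned 64-bit arithmetic makes the later addition come out right.
 */
static void set_scale(uint64_t cpu_khz, uint64_t tsc_now, uint64_t ns_now)
{
	scale  = (NSEC_PER_MSEC << SCALE_FACTOR) / cpu_khz;
	offset = ns_now - ((tsc_now * scale) >> SCALE_FACTOR);
}

int main(void)
{
	uint64_t tsc = 5000000000ULL;		/* pretend current TSC reading */

	set_scale(2000000, tsc, 0);		/* 2 GHz, clock starts at 0 ns */
	uint64_t before = cycles_to_ns(tsc);

	set_scale(1000000, tsc, before);	/* frequency drops to 1 GHz */
	uint64_t after = cycles_to_ns(tsc);

	printf("before=%llu ns, after=%llu ns (continuous)\n",
	       (unsigned long long)before, (unsigned long long)after);
	return 0;
}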
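
The vread_tsc() hunk wraps RDTSC in rdtsc_barrier() calls and clamps the result against cycle_last so the vsyscall clock cannot appear to step backwards. The snippet below is a userspace illustration of the same pattern using compiler intrinsics; _mm_lfence() stands in for rdtsc_barrier() (which the kernel patches to LFENCE or MFENCE depending on the CPU), and cycle_last/read_tsc_clamped are made-up names for the sketch.

/* Userspace illustration of the fenced, clamped TSC read. */
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

static uint64_t cycle_last;	/* updated by timekeeping in the real code */

static uint64_t read_tsc_clamped(void)
{
	uint64_t ret;

	_mm_lfence();		/* order RDTSC after the loads that precede it */
	ret = __rdtsc();
	_mm_lfence();		/* ...and before the loads that follow */

	/* never report a value older than the last timekeeper update */
	return ret >= cycle_last ? ret : cycle_last;
}

int main(void)
{
	cycle_last = read_tsc_clamped();
	printf("delta since last update: %llu cycles\n",
	       (unsigned long long)(read_tsc_clamped() - cycle_last));
	return 0;
}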