Diffstat (limited to 'arch/x86/kernel'): 114 files changed, 6915 insertions, 4842 deletions
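The bulk of the changes below fold the old 32-bit subarchitecture templates and the 64-bit struct genapic into a single struct apic driver model under arch/x86/kernel/apic/: each APIC flavour (flat, physflat, bigsmp, es7000, numaq, summit, ...) fills in one table of callbacks plus a probe() hook, and the rest of the kernel calls through the selected apic-> pointer instead of compile-time #ifdefs. The stand-alone C sketch below only illustrates that ops-table pattern; the type, field and function names are simplified stand-ins and are not the kernel's actual definitions.

/*
 * Illustrative sketch of the driver-ops pattern used by the new
 * arch/x86/kernel/apic/ code: each driver fills in a callback table
 * plus a probe() hook, and a probe loop picks the first driver that
 * claims the machine.  Names here are hypothetical stand-ins.
 */
#include <stdio.h>

struct fake_apic {
	const char *name;
	int  (*probe)(void);                 /* does this driver match the machine? */
	void (*send_IPI_all)(int vector);    /* one of many per-driver operations */
};

static int flat_probe(void)
{
	return 1;                            /* pretend the "flat" driver always matches */
}

static void flat_send_IPI_all(int vector)
{
	printf("flat: IPI vector 0x%x to all CPUs\n", vector);
}

static struct fake_apic apic_flat = {
	.name         = "flat",
	.probe        = flat_probe,
	.send_IPI_all = flat_send_IPI_all,
};

static struct fake_apic *drivers[] = { &apic_flat };
static struct fake_apic *apic;               /* selected driver, like the kernel's apic pointer */

int main(void)
{
	for (unsigned i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++) {
		if (drivers[i]->probe()) {
			apic = drivers[i];
			break;
		}
	}
	printf("using APIC driver: %s\n", apic->name);
	apic->send_IPI_all(0xef);            /* callers go through apic-> instead of #ifdefs */
	return 0;
}
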
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d364df03c1d..339ce35648e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)  CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)  CFLAGS_hpet.o		:= $(nostackp)  CFLAGS_tsc.o		:= $(nostackp) +CFLAGS_paravirt.o	:= $(nostackp)  obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o  obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o  obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o +obj-y			+= setup.o i8259.o irqinit_$(BITS).o  obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o  obj-$(CONFIG_X86_32)	+= probe_roms_32.o  obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o @@ -49,31 +50,27 @@ obj-y				+= step.o  obj-$(CONFIG_STACKTRACE)	+= stacktrace.o  obj-y				+= cpu/  obj-y				+= acpi/ -obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o +obj-y				+= reboot.o  obj-$(CONFIG_MCA)		+= mca_32.o  obj-$(CONFIG_X86_MSR)		+= msr.o  obj-$(CONFIG_X86_CPUID)		+= cpuid.o  obj-$(CONFIG_PCI)		+= early-quirks.o  apm-y				:= apm_32.o  obj-$(CONFIG_APM)		+= apm.o -obj-$(CONFIG_X86_SMP)		+= smp.o -obj-$(CONFIG_X86_SMP)		+= smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o -obj-$(CONFIG_X86_32_SMP)	+= smpcommon.o -obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o smpcommon.o +obj-$(CONFIG_SMP)		+= smp.o +obj-$(CONFIG_SMP)		+= smpboot.o tsc_sync.o +obj-$(CONFIG_SMP)		+= setup_percpu.o +obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o  obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline_$(BITS).o  obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o +obj-y				+= apic/  obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o  obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o  obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o  obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o  obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o  obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o -obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o -obj-$(CONFIG_X86_ES7000)	+= es7000_32.o -obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit_32.o -obj-y				+= vsmp_64.o +obj-$(CONFIG_X86_VSMP)		+= vsmp_64.o  obj-$(CONFIG_KPROBES)		+= kprobes.o  obj-$(CONFIG_MODULES)		+= module_$(BITS).o  obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o @@ -114,16 +111,13 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb_64.o # NB rename without _64  ###  # 64 bit specific files  ifeq ($(CONFIG_X86_64),y) -        obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o -	obj-y				+= bios_uv.o uv_irq.o uv_sysfs.o -        obj-y				+= genx2apic_cluster.o -        obj-y				+= genx2apic_phys.o -        obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o -        obj-$(CONFIG_AUDIT)		+= audit_64.o +	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o +	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o +	obj-$(CONFIG_AUDIT)		+= audit_64.o -        obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o -        obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o -        obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o +	obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o +	obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o +	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o -        obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o +	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o  endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7678f10c456..a18eb7ce223 100644 
--- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,15 +37,10 @@  #include <asm/pgtable.h>  #include <asm/io_apic.h>  #include <asm/apic.h> -#include <asm/genapic.h>  #include <asm/io.h>  #include <asm/mpspec.h>  #include <asm/smp.h> -#ifdef CONFIG_X86_LOCAL_APIC -# include <mach_apic.h> -#endif -  static int __initdata acpi_force = 0;  u32 acpi_rsdt_forced;  #ifdef	CONFIG_ACPI @@ -56,16 +51,7 @@ int acpi_disabled = 1;  EXPORT_SYMBOL(acpi_disabled);  #ifdef	CONFIG_X86_64 - -#include <asm/proto.h> - -#else				/* X86 */ - -#ifdef	CONFIG_X86_LOCAL_APIC -#include <mach_apic.h> -#include <mach_mpparse.h> -#endif				/* CONFIG_X86_LOCAL_APIC */ - +# include <asm/proto.h>  #endif				/* X86 */  #define BAD_MADT_ENTRY(entry, end) (					    \ @@ -121,35 +107,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;   */  char *__init __acpi_map_table(unsigned long phys, unsigned long size)  { -	unsigned long base, offset, mapped_size; -	int idx;  	if (!phys || !size)  		return NULL; -	if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) -		return __va(phys); - -	offset = phys & (PAGE_SIZE - 1); -	mapped_size = PAGE_SIZE - offset; -	clear_fixmap(FIX_ACPI_END); -	set_fixmap(FIX_ACPI_END, phys); -	base = fix_to_virt(FIX_ACPI_END); - -	/* -	 * Most cases can be covered by the below. -	 */ -	idx = FIX_ACPI_END; -	while (mapped_size < size) { -		if (--idx < FIX_ACPI_BEGIN) -			return NULL;	/* cannot handle this */ -		phys += PAGE_SIZE; -		clear_fixmap(idx); -		set_fixmap(idx, phys); -		mapped_size += PAGE_SIZE; -	} +	return early_ioremap(phys, size); +} +void __init __acpi_unmap_table(char *map, unsigned long size) +{ +	if (!map || !size) +		return; -	return ((unsigned char *)base + offset); +	early_iounmap(map, size);  }  #ifdef CONFIG_PCI_MMCONFIG @@ -239,7 +208,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)  		       madt->address);  	} -	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); +	default_acpi_madt_oem_check(madt->header.oem_id, +				    madt->header.oem_table_id);  	return 0;  } @@ -884,7 +854,7 @@ static struct {  	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);  } mp_ioapic_routing[MAX_IO_APICS]; -static int mp_find_ioapic(int gsi) +int mp_find_ioapic(int gsi)  {  	int i = 0; @@ -899,6 +869,16 @@ static int mp_find_ioapic(int gsi)  	return -1;  } +int mp_find_ioapic_pin(int ioapic, int gsi) +{ +	if (WARN_ON(ioapic == -1)) +		return -1; +	if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) +		return -1; + +	return gsi - mp_ioapic_routing[ioapic].gsi_base; +} +  static u8 __init uniq_ioapic_id(u8 id)  {  #ifdef CONFIG_X86_32 @@ -912,8 +892,8 @@ static u8 __init uniq_ioapic_id(u8 id)  	DECLARE_BITMAP(used, 256);  	bitmap_zero(used, 256);  	for (i = 0; i < nr_ioapics; i++) { -		struct mp_config_ioapic *ia = &mp_ioapics[i]; -		__set_bit(ia->mp_apicid, used); +		struct mpc_ioapic *ia = &mp_ioapics[i]; +		__set_bit(ia->apicid, used);  	}  	if (!test_bit(id, used))  		return id; @@ -945,29 +925,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)  	idx = nr_ioapics; -	mp_ioapics[idx].mp_type = MP_IOAPIC; -	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; -	mp_ioapics[idx].mp_apicaddr = address; +	mp_ioapics[idx].type = MP_IOAPIC; +	mp_ioapics[idx].flags = MPC_APIC_USABLE; +	mp_ioapics[idx].apicaddr = address;  	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); -	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); +	mp_ioapics[idx].apicid = uniq_ioapic_id(id);  #ifdef CONFIG_X86_32 -	
mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); +	mp_ioapics[idx].apicver = io_apic_get_version(idx);  #else -	mp_ioapics[idx].mp_apicver = 0; +	mp_ioapics[idx].apicver = 0;  #endif  	/*  	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups  	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).  	 */ -	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; +	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;  	mp_ioapic_routing[idx].gsi_base = gsi_base;  	mp_ioapic_routing[idx].gsi_end = gsi_base +  	    io_apic_get_redir_entries(idx); -	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " -	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, -	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, +	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " +	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, +	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,  	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);  	nr_ioapics++; @@ -996,19 +976,19 @@ int __init acpi_probe_gsi(void)  	return max_gsi + 1;  } -static void assign_to_mp_irq(struct mp_config_intsrc *m, -				    struct mp_config_intsrc *mp_irq) +static void assign_to_mp_irq(struct mpc_intsrc *m, +				    struct mpc_intsrc *mp_irq)  { -	memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); +	memcpy(mp_irq, m, sizeof(struct mpc_intsrc));  } -static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, -				struct mp_config_intsrc *m) +static int mp_irq_cmp(struct mpc_intsrc *mp_irq, +				struct mpc_intsrc *m)  { -	return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); +	return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));  } -static void save_mp_irq(struct mp_config_intsrc *m) +static void save_mp_irq(struct mpc_intsrc *m)  {  	int i; @@ -1026,7 +1006,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)  {  	int ioapic;  	int pin; -	struct mp_config_intsrc mp_irq; +	struct mpc_intsrc mp_irq;  	/*  	 * Convert 'gsi' to 'ioapic.pin'. 
@@ -1034,7 +1014,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)  	ioapic = mp_find_ioapic(gsi);  	if (ioapic < 0)  		return; -	pin = gsi - mp_ioapic_routing[ioapic].gsi_base; +	pin = mp_find_ioapic_pin(ioapic, gsi);  	/*  	 * TBD: This check is for faulty timer entries, where the override @@ -1044,13 +1024,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)  	if ((bus_irq == 0) && (trigger == 3))  		trigger = 1; -	mp_irq.mp_type = MP_INTSRC; -	mp_irq.mp_irqtype = mp_INT; -	mp_irq.mp_irqflag = (trigger << 2) | polarity; -	mp_irq.mp_srcbus = MP_ISA_BUS; -	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */ -	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ -	mp_irq.mp_dstirq = pin;	/* INTIN# */ +	mp_irq.type = MP_INTSRC; +	mp_irq.irqtype = mp_INT; +	mp_irq.irqflag = (trigger << 2) | polarity; +	mp_irq.srcbus = MP_ISA_BUS; +	mp_irq.srcbusirq = bus_irq;	/* IRQ */ +	mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ +	mp_irq.dstirq = pin;	/* INTIN# */  	save_mp_irq(&mp_irq);  } @@ -1060,7 +1040,7 @@ void __init mp_config_acpi_legacy_irqs(void)  	int i;  	int ioapic;  	unsigned int dstapic; -	struct mp_config_intsrc mp_irq; +	struct mpc_intsrc mp_irq;  #if defined (CONFIG_MCA) || defined (CONFIG_EISA)  	/* @@ -1085,7 +1065,7 @@ void __init mp_config_acpi_legacy_irqs(void)  	ioapic = mp_find_ioapic(0);  	if (ioapic < 0)  		return; -	dstapic = mp_ioapics[ioapic].mp_apicid; +	dstapic = mp_ioapics[ioapic].apicid;  	/*  	 * Use the default configuration for the IRQs 0-15.  Unless @@ -1095,16 +1075,14 @@ void __init mp_config_acpi_legacy_irqs(void)  		int idx;  		for (idx = 0; idx < mp_irq_entries; idx++) { -			struct mp_config_intsrc *irq = mp_irqs + idx; +			struct mpc_intsrc *irq = mp_irqs + idx;  			/* Do we already have a mapping for this ISA IRQ? */ -			if (irq->mp_srcbus == MP_ISA_BUS -			    && irq->mp_srcbusirq == i) +			if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)  				break;  			/* Do we already have a mapping for this IOAPIC pin */ -			if (irq->mp_dstapic == dstapic && -			    irq->mp_dstirq == i) +			if (irq->dstapic == dstapic && irq->dstirq == i)  				break;  		} @@ -1113,13 +1091,13 @@ void __init mp_config_acpi_legacy_irqs(void)  			continue;	/* IRQ already used */  		} -		mp_irq.mp_type = MP_INTSRC; -		mp_irq.mp_irqflag = 0;	/* Conforming */ -		mp_irq.mp_srcbus = MP_ISA_BUS; -		mp_irq.mp_dstapic = dstapic; -		mp_irq.mp_irqtype = mp_INT; -		mp_irq.mp_srcbusirq = i; /* Identity mapped */ -		mp_irq.mp_dstirq = i; +		mp_irq.type = MP_INTSRC; +		mp_irq.irqflag = 0;	/* Conforming */ +		mp_irq.srcbus = MP_ISA_BUS; +		mp_irq.dstapic = dstapic; +		mp_irq.irqtype = mp_INT; +		mp_irq.srcbusirq = i; /* Identity mapped */ +		mp_irq.dstirq = i;  		save_mp_irq(&mp_irq);  	} @@ -1156,7 +1134,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)  		return gsi;  	} -	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; +	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);  #ifdef CONFIG_X86_32  	if (ioapic_renumber_irq) @@ -1230,22 +1208,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,  			u32 gsi, int triggering, int polarity)  {  #ifdef CONFIG_X86_MPPARSE -	struct mp_config_intsrc mp_irq; +	struct mpc_intsrc mp_irq;  	int ioapic;  	if (!acpi_ioapic)  		return 0;  	/* print the entry should happen on mptable identically */ -	mp_irq.mp_type = MP_INTSRC; -	mp_irq.mp_irqtype = mp_INT; -	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 
4 : 0x0c) | +	mp_irq.type = MP_INTSRC; +	mp_irq.irqtype = mp_INT; +	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |  				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3); -	mp_irq.mp_srcbus = number; -	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); +	mp_irq.srcbus = number; +	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);  	ioapic = mp_find_ioapic(gsi); -	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; -	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; +	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; +	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);  	save_mp_irq(&mp_irq);  #endif @@ -1372,7 +1350,7 @@ static void __init acpi_process_madt(void)  		if (!error) {  			acpi_lapic = 1; -#ifdef CONFIG_X86_GENERICARCH +#ifdef CONFIG_X86_BIGSMP  			generic_bigsmp_probe();  #endif  			/* @@ -1384,9 +1362,8 @@ static void __init acpi_process_madt(void)  				acpi_ioapic = 1;  				smp_found_config = 1; -#ifdef CONFIG_X86_32 -				setup_apic_routing(); -#endif +				if (apic->setup_apic_routing) +					apic->setup_apic_routing();  			}  		}  		if (error == -EINVAL) { diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 3355973b12a..580b4e29601 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -3,8 +3,8 @@   */  #include <asm/segment.h>  #include <asm/msr-index.h> -#include <asm/page.h> -#include <asm/pgtable.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h>  #include <asm/processor-flags.h>  	.code16 diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a60c1f3bcb8..7c243a2c511 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,6 +101,7 @@ int acpi_save_state_mem(void)  	stack_start.sp = temp_stack + sizeof(temp_stack);  	early_gdt_descr.address =  			(unsigned long)get_cpu_gdt_table(smp_processor_id()); +	initial_gs = per_cpu_offset(smp_processor_id());  #endif  	initial_code = (unsigned long)wakeup_long64;  	saved_magic = 0x123456789abcdef0; diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index a12e6a9fb65..8ded418b059 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,7 +1,7 @@  	.section .text.page_aligned  #include <linux/linkage.h>  #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h>  # Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 96258d9dc97..8ea5164cbd0 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,8 +1,8 @@  .text  #include <linux/linkage.h>  #include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h>  #include <asm/msr.h>  #include <asm/asm-offsets.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e..4c80f155743 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void)  	   that might execute the to be patched code.  	   Other CPUs are not running. */  	stop_nmi(); -#ifdef CONFIG_X86_MCE -	stop_mce(); -#endif + +	/* +	 * Don't stop machine check exceptions while patching. +	 * MCEs only happen when something got corrupted and in this +	 * case we must do something about the corruption. 
+	 * Ignoring it is worse than a unlikely patching race. +	 * Also machine checks tend to be broadcast and if one CPU +	 * goes into machine check the others follow quickly, so we don't +	 * expect a machine check to cause undue problems during to code +	 * patching. +	 */  	apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void)  				(unsigned long)__smp_locks_end);  	restart_nmi(); -#ifdef CONFIG_X86_MCE -	restart_mce(); -#endif  }  /** @@ -498,12 +503,12 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)   */  void *__kprobes text_poke(void *addr, const void *opcode, size_t len)  { -	unsigned long flags;  	char *vaddr;  	int nr_pages = 2;  	struct page *pages[2];  	int i; +	might_sleep();  	if (!core_kernel_text((unsigned long)addr)) {  		pages[0] = vmalloc_to_page(addr);  		pages[1] = vmalloc_to_page(addr + PAGE_SIZE); @@ -517,9 +522,9 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)  		nr_pages = 1;  	vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);  	BUG_ON(!vaddr); -	local_irq_save(flags); +	local_irq_disable();  	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); -	local_irq_restore(flags); +	local_irq_enable();  	vunmap(vaddr);  	sync_core();  	/* Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile new file mode 100644 index 00000000000..da7b7b9f8bd --- /dev/null +++ b/arch/x86/kernel/apic/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for local APIC drivers and for the IO-APIC code +# + +obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o +obj-$(CONFIG_SMP)		+= ipi.o + +ifeq ($(CONFIG_X86_64),y) +obj-y				+= apic_flat_64.o +obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o +obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o +obj-$(CONFIG_X86_UV)		+= x2apic_uv_x.o +endif + +obj-$(CONFIG_X86_BIGSMP)	+= bigsmp_32.o +obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o +obj-$(CONFIG_X86_ES7000)	+= es7000_32.o +obj-$(CONFIG_X86_SUMMIT)	+= summit_32.o diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic/apic.c index 570f36e44e5..30909a258d0 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1,7 +1,7 @@  /*   *	Local APIC handling, local APIC timers   * - *	(c) 1999, 2000 Ingo Molnar <mingo@redhat.com> + *	(c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>   *   *	Fixes   *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs; @@ -14,51 +14,70 @@   *	Mikael Pettersson	:	PM converted to driver model.   
*/ -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h>  #include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/ioport.h> -#include <linux/cpu.h> -#include <linux/clockchips.h> +#include <linux/mc146818rtc.h>  #include <linux/acpi_pmtmr.h> +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/bootmem.h> +#include <linux/ftrace.h> +#include <linux/ioport.h>  #include <linux/module.h> -#include <linux/dmi.h> +#include <linux/sysdev.h> +#include <linux/delay.h> +#include <linux/timex.h>  #include <linux/dmar.h> -#include <linux/ftrace.h> -#include <linux/smp.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/dmi.h>  #include <linux/nmi.h> -#include <linux/timex.h> +#include <linux/smp.h> +#include <linux/mm.h> +#include <asm/pgalloc.h>  #include <asm/atomic.h> -#include <asm/mtrr.h>  #include <asm/mpspec.h> -#include <asm/desc.h> -#include <asm/arch_hooks.h> -#include <asm/hpet.h> -#include <asm/pgalloc.h>  #include <asm/i8253.h> -#include <asm/idle.h> +#include <asm/i8259.h>  #include <asm/proto.h>  #include <asm/apic.h> -#include <asm/i8259.h> +#include <asm/desc.h> +#include <asm/hpet.h> +#include <asm/idle.h> +#include <asm/mtrr.h>  #include <asm/smp.h> +#include <asm/mce.h> + +unsigned int num_processors; + +unsigned disabled_cpus __cpuinitdata; -#include <mach_apic.h> -#include <mach_apicdef.h> -#include <mach_ipi.h> +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U;  /* - * Sanity check + * The highest APIC ID seen during enumeration. + * + * This determines the messaging protocol we can use: if all APIC IDs + * are in the 0 ... 7 range, then we can use logical addressing which + * has some performance advantages (better broadcasting). + * + * If there's an APIC ID above 8, we use physical addressing.   */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif +unsigned int max_physical_apicid; + +/* + * Bitmask of physically existing CPUs: + */ +physid_mask_t phys_cpu_present_map; + +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);  #ifdef CONFIG_X86_32  /* @@ -92,11 +111,7 @@ static __init int setup_apicpmtimer(char *s)  __setup("apicpmtimer", setup_apicpmtimer);  #endif -#ifdef CONFIG_X86_64 -#define HAVE_X2APIC -#endif - -#ifdef HAVE_X2APIC +#ifdef CONFIG_X86_X2APIC  int x2apic;  /* x2apic enabled before OS handover */  static int x2apic_preenabled; @@ -194,18 +209,13 @@ static int modern_apic(void)  	return lapic_get_version() >= 0x14;  } -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. 
- */ -void xapic_wait_icr_idle(void) +void native_apic_wait_icr_idle(void)  {  	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)  		cpu_relax();  } -u32 safe_xapic_wait_icr_idle(void) +u32 native_safe_apic_wait_icr_idle(void)  {  	u32 send_status;  	int timeout; @@ -221,13 +231,13 @@ u32 safe_xapic_wait_icr_idle(void)  	return send_status;  } -void xapic_icr_write(u32 low, u32 id) +void native_apic_icr_write(u32 low, u32 id)  {  	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));  	apic_write(APIC_ICR, low);  } -static u64 xapic_icr_read(void) +u64 native_apic_icr_read(void)  {  	u32 icr1, icr2; @@ -237,54 +247,6 @@ static u64 xapic_icr_read(void)  	return icr1 | ((u64)icr2 << 32);  } -static struct apic_ops xapic_ops = { -	.read = native_apic_mem_read, -	.write = native_apic_mem_write, -	.icr_read = xapic_icr_read, -	.icr_write = xapic_icr_write, -	.wait_icr_idle = xapic_wait_icr_idle, -	.safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -#ifdef HAVE_X2APIC -static void x2apic_wait_icr_idle(void) -{ -	/* no need to wait for icr idle in x2apic */ -	return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ -	/* no need to wait for icr idle in x2apic */ -	return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ -	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -static u64 x2apic_icr_read(void) -{ -	unsigned long val; - -	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); -	return val; -} - -static struct apic_ops x2apic_ops = { -	.read = native_apic_msr_read, -	.write = native_apic_msr_write, -	.icr_read = x2apic_icr_read, -	.icr_write = x2apic_icr_write, -	.wait_icr_idle = x2apic_wait_icr_idle, -	.safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; -#endif -  /**   * enable_NMI_through_LVT0 - enable NMI through local vector table 0   */ @@ -457,7 +419,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,  static void lapic_timer_broadcast(const struct cpumask *mask)  {  #ifdef CONFIG_SMP -	send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +	apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);  #endif  } @@ -535,7 +497,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)  	}  } -static int __init calibrate_by_pmtimer(long deltapm, long *delta) +static int __init +calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)  {  	const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;  	const long pm_thresh = pm_100ms / 100; @@ -546,7 +509,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)  	return -1;  #endif -	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); +	apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);  	/* Check, if the PM timer is available */  	if (!deltapm) @@ -556,19 +519,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)  	if (deltapm > (pm_100ms - pm_thresh) &&  	    deltapm < (pm_100ms + pm_thresh)) { -		apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); -	} else { -		res = (((u64)deltapm) *  mult) >> 22; -		do_div(res, 1000000); -		pr_warning("APIC calibration not consistent " -			"with PM Timer: %ldms instead of 100ms\n", -			(long)res); -		/* Correct the lapic counter value */ -		res = (((u64)(*delta)) * pm_100ms); +		apic_printk(APIC_VERBOSE, "... 
PM-Timer result ok\n"); +		return 0; +	} + +	res = (((u64)deltapm) *  mult) >> 22; +	do_div(res, 1000000); +	pr_warning("APIC calibration not consistent " +		   "with PM-Timer: %ldms instead of 100ms\n",(long)res); + +	/* Correct the lapic counter value */ +	res = (((u64)(*delta)) * pm_100ms); +	do_div(res, deltapm); +	pr_info("APIC delta adjusted to PM-Timer: " +		"%lu (%ld)\n", (unsigned long)res, *delta); +	*delta = (long)res; + +	/* Correct the tsc counter value */ +	if (cpu_has_tsc) { +		res = (((u64)(*deltatsc)) * pm_100ms);  		do_div(res, deltapm); -		pr_info("APIC delta adjusted to PM-Timer: " -			"%lu (%ld)\n", (unsigned long)res, *delta); -		*delta = (long)res; +		apic_printk(APIC_VERBOSE, "TSC delta adjusted to " +					  "PM-Timer: %lu (%ld) \n", +					(unsigned long)res, *deltatsc); +		*deltatsc = (long)res;  	}  	return 0; @@ -579,7 +553,7 @@ static int __init calibrate_APIC_clock(void)  	struct clock_event_device *levt = &__get_cpu_var(lapic_events);  	void (*real_handler)(struct clock_event_device *dev);  	unsigned long deltaj; -	long delta; +	long delta, deltatsc;  	int pm_referenced = 0;  	local_irq_disable(); @@ -609,9 +583,11 @@ static int __init calibrate_APIC_clock(void)  	delta = lapic_cal_t1 - lapic_cal_t2;  	apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); +	deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); +  	/* we trust the PM based calibration if possible */  	pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, -					&delta); +					&delta, &deltatsc);  	/* Calculate the scaled math multiplication factor */  	lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -629,11 +605,10 @@ static int __init calibrate_APIC_clock(void)  		    calibration_result);  	if (cpu_has_tsc) { -		delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);  		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "  			    "%ld.%04ld MHz.\n", -			    (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), -			    (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); +			    (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), +			    (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));  	}  	apic_printk(APIC_VERBOSE, "..... host bus clock speed is " @@ -868,6 +843,14 @@ void clear_local_APIC(void)  		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);  	}  #endif +#ifdef CONFIG_X86_MCE_INTEL +	if (maxlvt >= 6) { +		v = apic_read(APIC_LVTCMCI); +		if (!(v & APIC_LVT_MASKED)) +			apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); +	} +#endif +  	/*  	 * Clean APIC state for other OSs:  	 */ @@ -991,11 +974,11 @@ int __init verify_local_APIC(void)  	 */  	reg0 = apic_read(APIC_ID);  	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); -	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); +	apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);  	reg1 = apic_read(APIC_ID);  	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);  	apic_write(APIC_ID, reg0); -	if (reg1 != (reg0 ^ APIC_ID_MASK)) +	if (reg1 != (reg0 ^ apic->apic_id_mask))  		return 0;  	/* @@ -1089,7 +1072,7 @@ static void __cpuinit lapic_setup_esr(void)  		return;  	} -	if (esr_disable) { +	if (apic->disable_esr) {  		/*  		 * Something untraceable is creating bad interrupts on  		 * secondary quads ... 
for the moment, just leave the @@ -1130,9 +1113,14 @@ void __cpuinit setup_local_APIC(void)  	unsigned int value;  	int i, j; +	if (disable_apic) { +		arch_disable_smp_support(); +		return; +	} +  #ifdef CONFIG_X86_32  	/* Pound the ESR really hard over the head with a big hammer - mbligh */ -	if (lapic_is_integrated() && esr_disable) { +	if (lapic_is_integrated() && apic->disable_esr) {  		apic_write(APIC_ESR, 0);  		apic_write(APIC_ESR, 0);  		apic_write(APIC_ESR, 0); @@ -1146,7 +1134,7 @@ void __cpuinit setup_local_APIC(void)  	 * Double-check whether this APIC is really registered.  	 * This is meaningless in clustered apic mode, so we skip it.  	 */ -	if (!apic_id_registered()) +	if (!apic->apic_id_registered())  		BUG();  	/* @@ -1154,7 +1142,7 @@ void __cpuinit setup_local_APIC(void)  	 * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel  	 * document number 292116).  So here it goes...  	 */ -	init_apic_ldr(); +	apic->init_apic_ldr();  	/*  	 * Set Task Priority to 'accept all'. We never change this @@ -1262,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)  	apic_write(APIC_LVT1, value);  	preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL +	/* Recheck CMCI information after local APIC is up on CPU #0 */ +	if (smp_processor_id() == 0) +		cmci_recheck(); +#endif  }  void __cpuinit end_local_APIC_setup(void) @@ -1282,17 +1276,12 @@ void __cpuinit end_local_APIC_setup(void)  	apic_pm_activate();  } -#ifdef HAVE_X2APIC +#ifdef CONFIG_X86_X2APIC  void check_x2apic(void)  { -	int msr, msr2; - -	rdmsr(MSR_IA32_APICBASE, msr, msr2); - -	if (msr & X2APIC_ENABLE) { +	if (x2apic_enabled()) {  		pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");  		x2apic_preenabled = x2apic = 1; -		apic_ops = &x2apic_ops;  	}  } @@ -1300,6 +1289,9 @@ void enable_x2apic(void)  {  	int msr, msr2; +	if (!x2apic) +		return; +  	rdmsr(MSR_IA32_APICBASE, msr, msr2);  	if (!(msr & X2APIC_ENABLE)) {  		pr_info("Enabling x2apic\n"); @@ -1363,7 +1355,6 @@ void __init enable_IR_x2apic(void)  	if (!x2apic) {  		x2apic = 1; -		apic_ops = &x2apic_ops;  		enable_x2apic();  	} @@ -1401,7 +1392,7 @@ end:  	return;  } -#endif /* HAVE_X2APIC */ +#endif /* CONFIG_X86_X2APIC */  #ifdef CONFIG_X86_64  /* @@ -1532,7 +1523,7 @@ void __init early_init_lapic_mapping(void)   */  void __init init_apic_mappings(void)  { -#ifdef HAVE_X2APIC +#ifdef CONFIG_X86_X2APIC  	if (x2apic) {  		boot_cpu_physical_apicid = read_apic_id();  		return; @@ -1570,11 +1561,11 @@ int apic_version[MAX_APICS];  int __init APIC_init_uniprocessor(void)  { -#ifdef CONFIG_X86_64  	if (disable_apic) {  		pr_info("Apic disabled\n");  		return -1;  	} +#ifdef CONFIG_X86_64  	if (!cpu_has_apic) {  		disable_apic = 1;  		pr_info("Apic disabled by BIOS\n"); @@ -1596,11 +1587,9 @@ int __init APIC_init_uniprocessor(void)  	}  #endif -#ifdef HAVE_X2APIC  	enable_IR_x2apic(); -#endif  #ifdef CONFIG_X86_64 -	setup_apic_routing(); +	default_setup_apic_routing();  #endif  	verify_local_APIC(); @@ -1621,35 +1610,31 @@ int __init APIC_init_uniprocessor(void)  	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);  	setup_local_APIC(); -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_IO_APIC  	/*  	 * Now enable IO-APICs, actually call clear_IO_APIC -	 * We need clear_IO_APIC before enabling vector on BP +	 * We need clear_IO_APIC before enabling error vector  	 */  	if (!skip_ioapic_setup && nr_ioapics)  		enable_IO_APIC();  #endif -#ifdef CONFIG_X86_IO_APIC -	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) -#endif -		
localise_nmi_watchdog();  	end_local_APIC_setup();  #ifdef CONFIG_X86_IO_APIC  	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)  		setup_IO_APIC(); -# ifdef CONFIG_X86_64 -	else +	else {  		nr_ioapics = 0; -# endif +		localise_nmi_watchdog(); +	} +#else +	localise_nmi_watchdog();  #endif +	setup_boot_clock();  #ifdef CONFIG_X86_64 -	setup_boot_APIC_clock();  	check_nmi_watchdog(); -#else -	setup_boot_clock();  #endif  	return 0; @@ -1738,7 +1723,8 @@ void __init connect_bsp_APIC(void)  		outb(0x01, 0x23);  	}  #endif -	enable_apic_mode(); +	if (apic->enable_apic_mode) +		apic->enable_apic_mode();  }  /** @@ -1876,29 +1862,39 @@ void __cpuinit generic_processor_info(int apicid, int version)  	}  #endif -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) -	/* are we being called early in kernel startup? */ -	if (early_per_cpu_ptr(x86_cpu_to_apicid)) { -		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); -		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - -		cpu_to_apicid[cpu] = apicid; -		bios_cpu_apicid[cpu] = apicid; -	} else { -		per_cpu(x86_cpu_to_apicid, cpu) = apicid; -		per_cpu(x86_bios_cpu_apicid, cpu) = apicid; -	} +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) +	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; +	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;  #endif  	set_cpu_possible(cpu, true);  	set_cpu_present(cpu, true);  } -#ifdef CONFIG_X86_64  int hard_smp_processor_id(void)  {  	return read_apic_id();  } + +void default_init_apic_ldr(void) +{ +	unsigned long val; + +	apic_write(APIC_DFR, APIC_DFR_VALUE); +	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +	val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); +	apic_write(APIC_LDR, val); +} + +#ifdef CONFIG_X86_32 +int default_apicid_to_node(int logical_apicid) +{ +#ifdef CONFIG_SMP +	return apicid_2_node[hard_smp_processor_id()]; +#else +	return 0; +#endif +}  #endif  /* @@ -1976,7 +1972,7 @@ static int lapic_resume(struct sys_device *dev)  	local_irq_save(flags); -#ifdef HAVE_X2APIC +#ifdef CONFIG_X86_X2APIC  	if (x2apic)  		enable_x2apic();  	else diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 34185488e4f..f933822dba1 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -17,9 +17,8 @@  #include <linux/init.h>  #include <linux/hardirq.h>  #include <asm/smp.h> +#include <asm/apic.h>  #include <asm/ipi.h> -#include <asm/genapic.h> -#include <mach_apicdef.h>  #ifdef CONFIG_ACPI  #include <acpi/acpi_bus.h> @@ -74,7 +73,7 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector)  	unsigned long flags;  	local_irq_save(flags); -	__send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); +	__default_send_IPI_dest_field(mask, vector, apic->dest_logical);  	local_irq_restore(flags);  } @@ -85,14 +84,15 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)  	_flat_send_IPI_mask(mask, vector);  } -static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, -					  int vector) +static void + flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)  {  	unsigned long mask = cpumask_bits(cpumask)[0];  	int cpu = smp_processor_id();  	if (cpu < BITS_PER_LONG)  		clear_bit(cpu, &mask); +  	_flat_send_IPI_mask(mask, vector);  } @@ -114,23 +114,27 @@ static void flat_send_IPI_allbutself(int vector)  			_flat_send_IPI_mask(mask, vector);  		}  	} else if (num_online_cpus() > 1) { -		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +		
__default_send_IPI_shortcut(APIC_DEST_ALLBUT, +					    vector, apic->dest_logical);  	}  }  static void flat_send_IPI_all(int vector)  { -	if (vector == NMI_VECTOR) +	if (vector == NMI_VECTOR) {  		flat_send_IPI_mask(cpu_online_mask, vector); -	else -		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +	} else { +		__default_send_IPI_shortcut(APIC_DEST_ALLINC, +					    vector, apic->dest_logical); +	}  } -static unsigned int get_apic_id(unsigned long x) +static unsigned int flat_get_apic_id(unsigned long x)  {  	unsigned int id;  	id = (((x)>>24) & 0xFFu); +  	return id;  } @@ -146,7 +150,7 @@ static unsigned int read_xapic_id(void)  {  	unsigned int id; -	id = get_apic_id(apic_read(APIC_ID)); +	id = flat_get_apic_id(apic_read(APIC_ID));  	return id;  } @@ -169,31 +173,67 @@ static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  	return mask1 & mask2;  } -static unsigned int phys_pkg_id(int index_msb) +static int flat_phys_pkg_id(int initial_apic_id, int index_msb)  {  	return hard_smp_processor_id() >> index_msb;  } -struct genapic apic_flat =  { -	.name = "flat", -	.acpi_madt_oem_check = flat_acpi_madt_oem_check, -	.int_delivery_mode = dest_LowestPrio, -	.int_dest_mode = (APIC_DEST_LOGICAL != 0), -	.target_cpus = flat_target_cpus, -	.vector_allocation_domain = flat_vector_allocation_domain, -	.apic_id_registered = flat_apic_id_registered, -	.init_apic_ldr = flat_init_apic_ldr, -	.send_IPI_all = flat_send_IPI_all, -	.send_IPI_allbutself = flat_send_IPI_allbutself, -	.send_IPI_mask = flat_send_IPI_mask, -	.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, -	.send_IPI_self = apic_send_IPI_self, -	.cpu_mask_to_apicid = flat_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, -	.phys_pkg_id = phys_pkg_id, -	.get_apic_id = get_apic_id, -	.set_apic_id = set_apic_id, -	.apic_id_mask = (0xFFu<<24), +struct apic apic_flat =  { +	.name				= "flat", +	.probe				= NULL, +	.acpi_madt_oem_check		= flat_acpi_madt_oem_check, +	.apic_id_registered		= flat_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	.irq_dest_mode			= 1, /* logical */ + +	.target_cpus			= flat_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= flat_vector_allocation_domain, +	.init_apic_ldr			= flat_init_apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.apicid_to_node			= NULL, +	.cpu_to_logical_apicid		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= flat_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= flat_get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xFFu << 24, + +	.cpu_mask_to_apicid		= flat_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= flat_send_IPI_mask, +	.send_IPI_mask_allbutself	= flat_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= flat_send_IPI_allbutself, +	.send_IPI_all			= flat_send_IPI_all, +	.send_IPI_self			= apic_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, + +	.read				= 
native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,  };  /* @@ -232,18 +272,18 @@ static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)  static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)  { -	send_IPI_mask_sequence(cpumask, vector); +	default_send_IPI_mask_sequence_phys(cpumask, vector);  }  static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,  					      int vector)  { -	send_IPI_mask_allbutself(cpumask, vector); +	default_send_IPI_mask_allbutself_phys(cpumask, vector);  }  static void physflat_send_IPI_allbutself(int vector)  { -	send_IPI_mask_allbutself(cpu_online_mask, vector); +	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);  }  static void physflat_send_IPI_all(int vector) @@ -276,32 +316,72 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.  	 * May as well be the first.  	 */ -	for_each_cpu_and(cpu, cpumask, andmask) +	for_each_cpu_and(cpu, cpumask, andmask) {  		if (cpumask_test_cpu(cpu, cpu_online_mask))  			break; +	}  	if (cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_apicid, cpu); +  	return BAD_APICID;  } -struct genapic apic_physflat =  { -	.name = "physical flat", -	.acpi_madt_oem_check = physflat_acpi_madt_oem_check, -	.int_delivery_mode = dest_Fixed, -	.int_dest_mode = (APIC_DEST_PHYSICAL != 0), -	.target_cpus = physflat_target_cpus, -	.vector_allocation_domain = physflat_vector_allocation_domain, -	.apic_id_registered = flat_apic_id_registered, -	.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ -	.send_IPI_all = physflat_send_IPI_all, -	.send_IPI_allbutself = physflat_send_IPI_allbutself, -	.send_IPI_mask = physflat_send_IPI_mask, -	.send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, -	.send_IPI_self = apic_send_IPI_self, -	.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, -	.phys_pkg_id = phys_pkg_id, -	.get_apic_id = get_apic_id, -	.set_apic_id = set_apic_id, -	.apic_id_mask = (0xFFu<<24), +struct apic apic_physflat =  { + +	.name				= "physical flat", +	.probe				= NULL, +	.acpi_madt_oem_check		= physflat_acpi_madt_oem_check, +	.apic_id_registered		= flat_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	.irq_dest_mode			= 0, /* physical */ + +	.target_cpus			= physflat_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= 0, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= physflat_vector_allocation_domain, +	/* not needed, but shouldn't hurt: */ +	.init_apic_ldr			= flat_init_apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.apicid_to_node			= NULL, +	.cpu_to_logical_apicid		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= flat_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= flat_get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xFFu << 24, + +	.cpu_mask_to_apicid		= physflat_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= physflat_cpu_mask_to_apicid_and, + +	.send_IPI_mask	
		= physflat_send_IPI_mask, +	.send_IPI_mask_allbutself	= physflat_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= physflat_send_IPI_allbutself, +	.send_IPI_all			= physflat_send_IPI_all, +	.send_IPI_self			= apic_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,  }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c new file mode 100644 index 00000000000..d806ecaa948 --- /dev/null +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -0,0 +1,267 @@ +/* + * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. + * + * Drives the local APIC in "clustered mode". + */ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/dmi.h> +#include <linux/smp.h> + +#include <asm/apicdef.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/ipi.h> + +static unsigned bigsmp_get_apic_id(unsigned long x) +{ +	return (x >> 24) & 0xFF; +} + +static int bigsmp_apic_id_registered(void) +{ +	return 1; +} + +static const cpumask_t *bigsmp_target_cpus(void) +{ +#ifdef CONFIG_SMP +	return &cpu_online_map; +#else +	return &cpumask_of_cpu(0); +#endif +} + +static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +{ +	return 0; +} + +static unsigned long bigsmp_check_apicid_present(int bit) +{ +	return 1; +} + +static inline unsigned long calculate_ldr(int cpu) +{ +	unsigned long val, id; + +	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +	id = per_cpu(x86_bios_cpu_apicid, cpu); +	val |= SET_APIC_LOGICAL_ID(id); + +	return val; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116).  So here it goes... + */ +static void bigsmp_init_apic_ldr(void) +{ +	unsigned long val; +	int cpu = smp_processor_id(); + +	apic_write(APIC_DFR, APIC_DFR_FLAT); +	val = calculate_ldr(cpu); +	apic_write(APIC_LDR, val); +} + +static void bigsmp_setup_apic_routing(void) +{ +	printk(KERN_INFO +		"Enabling APIC mode:  Physflat.  
Using %d I/O APICs\n", +		nr_ioapics); +} + +static int bigsmp_apicid_to_node(int logical_apicid) +{ +	return apicid_2_node[hard_smp_processor_id()]; +} + +static int bigsmp_cpu_present_to_apicid(int mps_cpu) +{ +	if (mps_cpu < nr_cpu_ids) +		return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + +	return BAD_APICID; +} + +static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) +{ +	return physid_mask_of_physid(phys_apicid); +} + +/* Mapping from cpu number to logical apicid */ +static inline int bigsmp_cpu_to_logical_apicid(int cpu) +{ +	if (cpu >= nr_cpu_ids) +		return BAD_APICID; +	return cpu_physical_id(cpu); +} + +static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +{ +	/* For clustered we don't have a good way to do this yet - hack */ +	return physids_promote(0xFFL); +} + +static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +{ +	return 1; +} + +/* As we are using single CPU as destination, pick only one CPU here */ +static unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask) +{ +	return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask)); +} + +static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			      const struct cpumask *andmask) +{ +	int cpu; + +	/* +	 * We're using fixed IRQ delivery, can only return one phys APIC ID. +	 * May as well be the first. +	 */ +	for_each_cpu_and(cpu, cpumask, andmask) { +		if (cpumask_test_cpu(cpu, cpu_online_mask)) +			break; +	} +	if (cpu < nr_cpu_ids) +		return bigsmp_cpu_to_logical_apicid(cpu); + +	return BAD_APICID; +} + +static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +{ +	return cpuid_apic >> index_msb; +} + +static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) +{ +	default_send_IPI_mask_sequence_phys(mask, vector); +} + +static void bigsmp_send_IPI_allbutself(int vector) +{ +	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void bigsmp_send_IPI_all(int vector) +{ +	bigsmp_send_IPI_mask(cpu_online_mask, vector); +} + +static int dmi_bigsmp; /* can be set by dmi scanners */ + +static int hp_ht_bigsmp(const struct dmi_system_id *d) +{ +	printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); +	dmi_bigsmp = 1; + +	return 0; +} + + +static const struct dmi_system_id bigsmp_dmi_table[] = { +	{ hp_ht_bigsmp, "HP ProLiant DL760 G2", +		{	DMI_MATCH(DMI_BIOS_VENDOR, "HP"), +			DMI_MATCH(DMI_BIOS_VERSION, "P44-"), +		} +	}, + +	{ hp_ht_bigsmp, "HP ProLiant DL740", +		{	DMI_MATCH(DMI_BIOS_VENDOR, "HP"), +			DMI_MATCH(DMI_BIOS_VERSION, "P47-"), +		} +	}, +	{ } /* NULL entry stops DMI scanning */ +}; + +static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask) +{ +	cpus_clear(*retmask); +	cpu_set(cpu, *retmask); +} + +static int probe_bigsmp(void) +{ +	if (def_to_bigsmp) +		dmi_bigsmp = 1; +	else +		dmi_check_system(bigsmp_dmi_table); + +	return dmi_bigsmp; +} + +struct apic apic_bigsmp = { + +	.name				= "bigsmp", +	.probe				= probe_bigsmp, +	.acpi_madt_oem_check		= NULL, +	.apic_id_registered		= bigsmp_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	/* phys delivery to target CPU: */ +	.irq_dest_mode			= 0, + +	.target_cpus			= bigsmp_target_cpus, +	.disable_esr			= 1, +	.dest_logical			= 0, +	.check_apicid_used		= bigsmp_check_apicid_used, +	.check_apicid_present		= bigsmp_check_apicid_present, + +	.vector_allocation_domain	= bigsmp_vector_allocation_domain, +	.init_apic_ldr			= bigsmp_init_apic_ldr, + +	.ioapic_phys_id_map		= 
bigsmp_ioapic_phys_id_map, +	.setup_apic_routing		= bigsmp_setup_apic_routing, +	.multi_timer_check		= NULL, +	.apicid_to_node			= bigsmp_apicid_to_node, +	.cpu_to_logical_apicid		= bigsmp_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= bigsmp_cpu_present_to_apicid, +	.apicid_to_cpu_present		= bigsmp_apicid_to_cpu_present, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= bigsmp_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= bigsmp_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= bigsmp_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0xFF << 24, + +	.cpu_mask_to_apicid		= bigsmp_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= bigsmp_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= bigsmp_send_IPI_mask, +	.send_IPI_mask_allbutself	= NULL, +	.send_IPI_allbutself		= bigsmp_send_IPI_allbutself, +	.send_IPI_all			= bigsmp_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, + +	.wait_for_init_deassert		= default_wait_for_init_deassert, + +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= default_inquire_remote_apic, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c new file mode 100644 index 00000000000..19588f2770e --- /dev/null +++ b/arch/x86/kernel/apic/es7000_32.c @@ -0,0 +1,780 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + *             Natalie Protasevich, Unisys Corporation + * + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. + * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar + * + *   All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ +#include <linux/notifier.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/nmi.h> +#include <linux/smp.h> +#include <linux/io.h> + +#include <asm/apicdef.h> +#include <asm/atomic.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/ipi.h> + +/* + * ES7000 chipsets + */ + +#define NON_UNISYS			0 +#define ES7000_CLASSIC			1 +#define ES7000_ZORRO			2 + +#define	MIP_REG				1 +#define	MIP_PSAI_REG			4 + +#define	MIP_BUSY			1 +#define	MIP_SPIN			0xf0000 +#define	MIP_VALID			0x0100000000000000ULL +#define	MIP_SW_APIC			0x1020b + +#define	MIP_PORT(val)			((val >> 32) & 0xffff) + +#define	MIP_RD_LO(val)			(val & 0xffffffff) + +struct mip_reg { +	unsigned long long		off_0x00; +	unsigned long long		off_0x08; +	unsigned long long		off_0x10; +	unsigned long long		off_0x18; +	unsigned long long		off_0x20; +	unsigned long long		off_0x28; +	unsigned long long		off_0x30; +	unsigned long long		off_0x38; +}; + +struct mip_reg_info { +	unsigned long long		mip_info; +	unsigned long long		delivery_info; +	unsigned long long		host_reg; +	unsigned long long		mip_reg; +}; + +struct psai { +	unsigned long long		entry_type; +	unsigned long long		addr; +	unsigned long long		bep_addr; +}; + +#ifdef CONFIG_ACPI + +struct es7000_oem_table { +	struct acpi_table_header	Header; +	u32				OEMTableAddr; +	u32				OEMTableSize; +}; + +static unsigned long			oem_addrX; +static unsigned long			oem_size; + +#endif + +/* + * ES7000 Globals + */ + +static volatile unsigned long		*psai; +static struct mip_reg			*mip_reg; +static struct mip_reg			*host_reg; +static int 				mip_port; +static unsigned long			mip_addr; +static unsigned long			host_addr; + +int					es7000_plat; + +/* + * GSI override for ES7000 platforms. + */ + +static unsigned int			base; + +static int +es7000_rename_gsi(int ioapic, int gsi) +{ +	if (es7000_plat == ES7000_ZORRO) +		return gsi; + +	if (!base) { +		int i; +		for (i = 0; i < nr_ioapics; i++) +			base += nr_ioapic_registers[i]; +	} + +	if (!ioapic && (gsi < 16)) +		gsi += base; + +	return gsi; +} + +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ +	unsigned long vect = 0, psaival = 0; + +	if (psai == NULL) +		return -1; + +	vect = ((unsigned long)__pa(eip)/0x1000) << 16; +	psaival = (0x1000000 | vect | cpu); + +	while (*psai & 0x1000000) +		; + +	*psai = psaival; + +	return 0; +} + +static int es7000_apic_is_cluster(void) +{ +	/* MPENTIUMIII */ +	if (boot_cpu_data.x86 == 6 && +	    (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) +		return 1; + +	return 0; +} + +static void setup_unisys(void) +{ +	/* +	 * Determine the generation of the ES7000 currently running. 
+	 * +	 * es7000_plat = 1 if the machine is a 5xx ES7000 box +	 * es7000_plat = 2 if the machine is a x86_64 ES7000 box +	 * +	 */ +	if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) +		es7000_plat = ES7000_ZORRO; +	else +		es7000_plat = ES7000_CLASSIC; +	ioapic_renumber_irq = es7000_rename_gsi; +} + +/* + * Parse the OEM Table: + */ +static int parse_unisys_oem(char *oemptr) +{ +	int			i; +	int 			success = 0; +	unsigned char		type, size; +	unsigned long		val; +	char			*tp = NULL; +	struct psai		*psaip = NULL; +	struct mip_reg_info 	*mi; +	struct mip_reg		*host, *mip; + +	tp = oemptr; + +	tp += 8; + +	for (i = 0; i <= 6; i++) { +		type = *tp++; +		size = *tp++; +		tp -= 2; +		switch (type) { +		case MIP_REG: +			mi = (struct mip_reg_info *)tp; +			val = MIP_RD_LO(mi->host_reg); +			host_addr = val; +			host = (struct mip_reg *)val; +			host_reg = __va(host); +			val = MIP_RD_LO(mi->mip_reg); +			mip_port = MIP_PORT(mi->mip_info); +			mip_addr = val; +			mip = (struct mip_reg *)val; +			mip_reg = __va(mip); +			pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", +				 (unsigned long)host_reg); +			pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", +				 (unsigned long)mip_reg); +			success++; +			break; +		case MIP_PSAI_REG: +			psaip = (struct psai *)tp; +			if (tp != NULL) { +				if (psaip->addr) +					psai = __va(psaip->addr); +				else +					psai = NULL; +				success++; +			} +			break; +		default: +			break; +		} +		tp += size; +	} + +	if (success < 2) +		es7000_plat = NON_UNISYS; +	else +		setup_unisys(); + +	return es7000_plat; +} + +#ifdef CONFIG_ACPI +static int find_unisys_acpi_oem_table(unsigned long *oem_addr) +{ +	struct acpi_table_header *header = NULL; +	struct es7000_oem_table *table; +	acpi_size tbl_size; +	acpi_status ret; +	int i = 0; + +	for (;;) { +		ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size); +		if (!ACPI_SUCCESS(ret)) +			return -1; + +		if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) +			break; + +		early_acpi_os_unmap_memory(header, tbl_size); +	} + +	table = (void *)header; + +	oem_addrX	= table->OEMTableAddr; +	oem_size	= table->OEMTableSize; + +	early_acpi_os_unmap_memory(header, tbl_size); + +	*oem_addr	= (unsigned long)__acpi_map_table(oem_addrX, oem_size); + +	return 0; +} + +static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) +{ +	if (!oem_addr) +		return; + +	__acpi_unmap_table((char *)oem_addr, oem_size); +} + +static int es7000_check_dsdt(void) +{ +	struct acpi_table_header header; + +	if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && +	    !strncmp(header.oem_id, "UNISYS", 6)) +		return 1; +	return 0; +} + +static int es7000_acpi_ret; + +/* Hook from generic ACPI tables.c */ +static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +	unsigned long oem_addr = 0; +	int check_dsdt; +	int ret = 0; + +	/* check dsdt at first to avoid clear fix_map for oem_addr */ +	check_dsdt = es7000_check_dsdt(); + +	if (!find_unisys_acpi_oem_table(&oem_addr)) { +		if (check_dsdt) { +			ret = parse_unisys_oem((char *)oem_addr); +		} else { +			setup_unisys(); +			ret = 1; +		} +		/* +		 * we need to unmap it +		 */ +		unmap_unisys_acpi_oem_table(oem_addr); +	} + +	es7000_acpi_ret = ret; + +	return ret && !es7000_apic_is_cluster(); +} + +static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) +{ +	int ret = es7000_acpi_ret; + +	return ret && es7000_apic_is_cluster(); +} + +#else /* !CONFIG_ACPI: */ +static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ 
+	return 0; +} + +static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) +{ +	return 0; +} +#endif /* !CONFIG_ACPI */ + +static void es7000_spin(int n) +{ +	int i = 0; + +	while (i++ < n) +		rep_nop(); +} + +static int es7000_mip_write(struct mip_reg *mip_reg) +{ +	int status = 0; +	int spin; + +	spin = MIP_SPIN; +	while ((host_reg->off_0x38 & MIP_VALID) != 0) { +		if (--spin <= 0) { +			WARN(1,	"Timeout waiting for Host Valid Flag\n"); +			return -1; +		} +		es7000_spin(MIP_SPIN); +	} + +	memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); +	outb(1, mip_port); + +	spin = MIP_SPIN; + +	while ((mip_reg->off_0x38 & MIP_VALID) == 0) { +		if (--spin <= 0) { +			WARN(1,	"Timeout waiting for MIP Valid Flag\n"); +			return -1; +		} +		es7000_spin(MIP_SPIN); +	} + +	status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48; +	mip_reg->off_0x38 &= ~MIP_VALID; + +	return status; +} + +static void es7000_enable_apic_mode(void) +{ +	struct mip_reg es7000_mip_reg; +	int mip_status; + +	if (!es7000_plat) +		return; + +	printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); +	memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); +	es7000_mip_reg.off_0x00 = MIP_SW_APIC; +	es7000_mip_reg.off_0x38 = MIP_VALID; + +	while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) +		WARN(1, "Command failed, status = %x\n", mip_status); +} + +static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask) +{ +	/* Careful. Some cpus do not strictly honor the set of cpus +	 * specified in the interrupt destination when using lowest +	 * priority interrupt delivery mode. +	 * +	 * In particular there was a hyperthreading cpu observed to +	 * deliver interrupts to the wrong hyperthread when only one +	 * hyperthread was specified in the interrupt desitination. +	 */ +	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; +} + + +static void es7000_wait_for_init_deassert(atomic_t *deassert) +{ +	while (!atomic_read(deassert)) +		cpu_relax(); +} + +static unsigned int es7000_get_apic_id(unsigned long x) +{ +	return (x >> 24) & 0xFF; +} + +static void es7000_send_IPI_mask(const struct cpumask *mask, int vector) +{ +	default_send_IPI_mask_sequence_phys(mask, vector); +} + +static void es7000_send_IPI_allbutself(int vector) +{ +	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void es7000_send_IPI_all(int vector) +{ +	es7000_send_IPI_mask(cpu_online_mask, vector); +} + +static int es7000_apic_id_registered(void) +{ +	return 1; +} + +static const cpumask_t *target_cpus_cluster(void) +{ +	return &CPU_MASK_ALL; +} + +static const cpumask_t *es7000_target_cpus(void) +{ +	return &cpumask_of_cpu(smp_processor_id()); +} + +static unsigned long +es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +{ +	return 0; +} +static unsigned long es7000_check_apicid_present(int bit) +{ +	return physid_isset(bit, phys_cpu_present_map); +} + +static unsigned long calculate_ldr(int cpu) +{ +	unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); + +	return SET_APIC_LOGICAL_ID(id); +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LdR and TPR before enabling + * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116).  So here it goes... 
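+ *
+ * calculate_ldr() above turns the BIOS-reported APIC ID of a CPU into
+ * its logical destination value; the two init_apic_ldr variants below
+ * differ only in whether the DFR is programmed for cluster or for flat
+ * logical delivery before that value is written to the LDR.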
+ */ +static void es7000_init_apic_ldr_cluster(void) +{ +	unsigned long val; +	int cpu = smp_processor_id(); + +	apic_write(APIC_DFR, APIC_DFR_CLUSTER); +	val = calculate_ldr(cpu); +	apic_write(APIC_LDR, val); +} + +static void es7000_init_apic_ldr(void) +{ +	unsigned long val; +	int cpu = smp_processor_id(); + +	apic_write(APIC_DFR, APIC_DFR_FLAT); +	val = calculate_ldr(cpu); +	apic_write(APIC_LDR, val); +} + +static void es7000_setup_apic_routing(void) +{ +	int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); + +	printk(KERN_INFO +	  "Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n", +		(apic_version[apic] == 0x14) ? +			"Physical Cluster" : "Logical Cluster", +		nr_ioapics, cpus_addr(*es7000_target_cpus())[0]); +} + +static int es7000_apicid_to_node(int logical_apicid) +{ +	return 0; +} + + +static int es7000_cpu_present_to_apicid(int mps_cpu) +{ +	if (!mps_cpu) +		return boot_cpu_physical_apicid; +	else if (mps_cpu < nr_cpu_ids) +		return per_cpu(x86_bios_cpu_apicid, mps_cpu); +	else +		return BAD_APICID; +} + +static int cpu_id; + +static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +{ +	physid_mask_t mask; + +	mask = physid_mask_of_physid(cpu_id); +	++cpu_id; + +	return mask; +} + +/* Mapping from cpu number to logical apicid */ +static int es7000_cpu_to_logical_apicid(int cpu) +{ +#ifdef CONFIG_SMP +	if (cpu >= nr_cpu_ids) +		return BAD_APICID; +	return cpu_2_logical_apicid[cpu]; +#else +	return logical_smp_processor_id(); +#endif +} + +static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +{ +	/* For clustered we don't have a good way to do this yet - hack */ +	return physids_promote(0xff); +} + +static int es7000_check_phys_apicid_present(int cpu_physical_apicid) +{ +	boot_cpu_physical_apicid = read_apic_id(); +	return 1; +} + +static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask) +{ +	unsigned int round = 0; +	int cpu, uninitialized_var(apicid); + +	/* +	 * The cpus in the mask must all be on the apic cluster. 
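+	 * If a cpu from a different APIC cluster is found, warn and
+	 * return BAD_APICID rather than hand back a mixed destination.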
+	 */ +	for_each_cpu(cpu, cpumask) { +		int new_apicid = es7000_cpu_to_logical_apicid(cpu); + +		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { +			WARN(1, "Not a valid mask!"); + +			return BAD_APICID; +		} +		apicid = new_apicid; +		round++; +	} +	return apicid; +} + +static unsigned int +es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, +			      const struct cpumask *andmask) +{ +	int apicid = es7000_cpu_to_logical_apicid(0); +	cpumask_var_t cpumask; + +	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) +		return apicid; + +	cpumask_and(cpumask, inmask, andmask); +	cpumask_and(cpumask, cpumask, cpu_online_mask); +	apicid = es7000_cpu_mask_to_apicid(cpumask); + +	free_cpumask_var(cpumask); + +	return apicid; +} + +static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) +{ +	return cpuid_apic >> index_msb; +} + +static int probe_es7000(void) +{ +	/* probed later in mptable/ACPI hooks */ +	return 0; +} + +static int es7000_mps_ret; +static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem, +		char *productid) +{ +	int ret = 0; + +	if (mpc->oemptr) { +		struct mpc_oemtable *oem_table = +			(struct mpc_oemtable *)mpc->oemptr; + +		if (!strncmp(oem, "UNISYS", 6)) +			ret = parse_unisys_oem((char *)oem_table); +	} + +	es7000_mps_ret = ret; + +	return ret && !es7000_apic_is_cluster(); +} + +static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, +		char *productid) +{ +	int ret = es7000_mps_ret; + +	return ret && es7000_apic_is_cluster(); +} + +struct apic apic_es7000_cluster = { + +	.name				= "es7000", +	.probe				= probe_es7000, +	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster, +	.apic_id_registered		= es7000_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	/* logical delivery broadcast to all procs: */ +	.irq_dest_mode			= 1, + +	.target_cpus			= target_cpus_cluster, +	.disable_esr			= 1, +	.dest_logical			= 0, +	.check_apicid_used		= es7000_check_apicid_used, +	.check_apicid_present		= es7000_check_apicid_present, + +	.vector_allocation_domain	= es7000_vector_allocation_domain, +	.init_apic_ldr			= es7000_init_apic_ldr_cluster, + +	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, +	.setup_apic_routing		= es7000_setup_apic_routing, +	.multi_timer_check		= NULL, +	.apicid_to_node			= es7000_apicid_to_node, +	.cpu_to_logical_apicid		= es7000_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid, +	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= es7000_check_phys_apicid_present, +	.enable_apic_mode		= es7000_enable_apic_mode, +	.phys_pkg_id			= es7000_phys_pkg_id, +	.mps_oem_check			= es7000_mps_oem_check_cluster, + +	.get_apic_id			= es7000_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0xFF << 24, + +	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= es7000_send_IPI_mask, +	.send_IPI_mask_allbutself	= NULL, +	.send_IPI_allbutself		= es7000_send_IPI_allbutself, +	.send_IPI_all			= es7000_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_mip, + +	.trampoline_phys_low		= 0x467, +	.trampoline_phys_high		= 0x469, + +	.wait_for_init_deassert		= NULL, + +	/* Nothing to do for most platforms, since cleared by the INIT cycle: */ +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= default_inquire_remote_apic, + +	.read				= native_apic_mem_read, +	.write				
= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; + +struct apic apic_es7000 = { + +	.name				= "es7000", +	.probe				= probe_es7000, +	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check, +	.apic_id_registered		= es7000_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	/* phys delivery to target CPUs: */ +	.irq_dest_mode			= 0, + +	.target_cpus			= es7000_target_cpus, +	.disable_esr			= 1, +	.dest_logical			= 0, +	.check_apicid_used		= es7000_check_apicid_used, +	.check_apicid_present		= es7000_check_apicid_present, + +	.vector_allocation_domain	= es7000_vector_allocation_domain, +	.init_apic_ldr			= es7000_init_apic_ldr, + +	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, +	.setup_apic_routing		= es7000_setup_apic_routing, +	.multi_timer_check		= NULL, +	.apicid_to_node			= es7000_apicid_to_node, +	.cpu_to_logical_apicid		= es7000_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid, +	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= es7000_check_phys_apicid_present, +	.enable_apic_mode		= es7000_enable_apic_mode, +	.phys_pkg_id			= es7000_phys_pkg_id, +	.mps_oem_check			= es7000_mps_oem_check, + +	.get_apic_id			= es7000_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0xFF << 24, + +	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= es7000_send_IPI_mask, +	.send_IPI_mask_allbutself	= NULL, +	.send_IPI_allbutself		= es7000_send_IPI_allbutself, +	.send_IPI_all			= es7000_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.trampoline_phys_low		= 0x467, +	.trampoline_phys_high		= 0x469, + +	.wait_for_init_deassert		= es7000_wait_for_init_deassert, + +	/* Nothing to do for most platforms, since cleared by the INIT cycle: */ +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= default_inquire_remote_apic, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/apic/io_apic.c index bc7ac4da90d..00e6071cefc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1,7 +1,7 @@  /*   *	Intel IO-APIC support for multi-Pentium hosts.   * - *	Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + *	Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo   *   *	Many thanks to Stig Venaas for trying out countless experimental   *	patches and reporting/debugging problems patiently! 
@@ -46,6 +46,7 @@  #include <asm/idle.h>  #include <asm/io.h>  #include <asm/smp.h> +#include <asm/cpu.h>  #include <asm/desc.h>  #include <asm/proto.h>  #include <asm/acpi.h> @@ -61,9 +62,7 @@  #include <asm/uv/uv_hub.h>  #include <asm/uv/uv_irq.h> -#include <mach_ipi.h> -#include <mach_apic.h> -#include <mach_apicdef.h> +#include <asm/apic.h>  #define __apicdebuginit(type) static type __init @@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);  int nr_ioapic_registers[MAX_IO_APICS];  /* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mpc_ioapic mp_ioapics[MAX_IO_APICS];  int nr_ioapics;  /* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];  /* # of MP IRQ source entries */  int mp_irq_entries; @@ -99,10 +98,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);  int skip_ioapic_setup; +void arch_disable_smp_support(void) +{ +#ifdef CONFIG_PCI +	noioapicquirk = 1; +	noioapicreroute = -1; +#endif +	skip_ioapic_setup = 1; +} +  static int __init parse_noapic(char *str)  {  	/* disable IO-APIC */ -	disable_ioapic_setup(); +	arch_disable_smp_support();  	return 0;  }  early_param("noapic", parse_noapic); @@ -356,7 +364,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)  	if (!cfg->move_in_progress) {  		/* it means that domain is not changed */ -		if (!cpumask_intersects(&desc->affinity, mask)) +		if (!cpumask_intersects(desc->affinity, mask))  			cfg->move_desc_pending = 1;  	}  } @@ -386,7 +394,7 @@ struct io_apic {  static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)  {  	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) -		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); +		+ (mp_ioapics[idx].apicaddr & ~PAGE_MASK);  }  static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -478,7 +486,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)  	io_apic_write(apic, 0x10 + 2*pin, eu.w1);  } -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)  {  	unsigned long flags;  	spin_lock_irqsave(&ioapic_lock, flags); @@ -513,11 +521,11 @@ static void send_cleanup_vector(struct irq_cfg *cfg)  		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)  			cfg->move_cleanup_count++;  		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) -			send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); +			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);  	} else {  		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);  		cfg->move_cleanup_count = cpumask_weight(cleanup_mask); -		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); +		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);  		free_cpumask_var(cleanup_mask);  	}  	cfg->move_in_progress = 0; @@ -562,8 +570,9 @@ static int  assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);  /* - * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid - * of that, or returns BAD_APICID and leaves desc->affinity untouched. + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched.   
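+ * (The affinity stored in desc->affinity is the requested mask
+ * intersected with the vector domain chosen by assign_irq_vector().)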
*/  static unsigned int  set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) @@ -579,9 +588,10 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)  	if (assign_irq_vector(irq, cfg, mask))  		return BAD_APICID; -	cpumask_and(&desc->affinity, cfg->domain, mask); +	cpumask_and(desc->affinity, cfg->domain, mask);  	set_extra_move_desc(desc, mask); -	return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); + +	return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);  }  static void @@ -796,23 +806,6 @@ static void clear_IO_APIC (void)  			clear_IO_APIC_pin(apic, pin);  } -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ -	unsigned int cfg; - -	/* -	 * Wait for idle. -	 */ -	apic_wait_icr_idle(); -	cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; -	/* -	 * Send the IPI. The write to APIC_ICR fires this off. -	 */ -	apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ -  #ifdef CONFIG_X86_32  /*   * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to @@ -820,8 +813,9 @@ void send_IPI_self(int vector)   */  #define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; +static int pirq_entries[MAX_PIRQS] = { +	[0 ... MAX_PIRQS - 1] = -1 +};  static int __init ioapic_pirq_setup(char *str)  { @@ -830,10 +824,6 @@ static int __init ioapic_pirq_setup(char *str)  	get_options(str, ARRAY_SIZE(ints), ints); -	for (i = 0; i < MAX_PIRQS; i++) -		pirq_entries[i] = -1; - -	pirqs_enabled = 1;  	apic_printk(APIC_VERBOSE, KERN_INFO  			"PIRQ redirection, working around broken MP-BIOS.\n");  	max = MAX_PIRQS; @@ -944,10 +934,10 @@ static int find_irq_entry(int apic, int pin, int type)  	int i;  	for (i = 0; i < mp_irq_entries; i++) -		if (mp_irqs[i].mp_irqtype == type && -		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || -		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) && -		    mp_irqs[i].mp_dstirq == pin) +		if (mp_irqs[i].irqtype == type && +		    (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || +		     mp_irqs[i].dstapic == MP_APIC_ALL) && +		    mp_irqs[i].dstirq == pin)  			return i;  	return -1; @@ -961,13 +951,13 @@ static int __init find_isa_irq_pin(int irq, int type)  	int i;  	for (i = 0; i < mp_irq_entries; i++) { -		int lbus = mp_irqs[i].mp_srcbus; +		int lbus = mp_irqs[i].srcbus;  		if (test_bit(lbus, mp_bus_not_pci) && -		    (mp_irqs[i].mp_irqtype == type) && -		    (mp_irqs[i].mp_srcbusirq == irq)) +		    (mp_irqs[i].irqtype == type) && +		    (mp_irqs[i].srcbusirq == irq)) -			return mp_irqs[i].mp_dstirq; +			return mp_irqs[i].dstirq;  	}  	return -1;  } @@ -977,17 +967,17 @@ static int __init find_isa_irq_apic(int irq, int type)  	int i;  	for (i = 0; i < mp_irq_entries; i++) { -		int lbus = mp_irqs[i].mp_srcbus; +		int lbus = mp_irqs[i].srcbus;  		if (test_bit(lbus, mp_bus_not_pci) && -		    (mp_irqs[i].mp_irqtype == type) && -		    (mp_irqs[i].mp_srcbusirq == irq)) +		    (mp_irqs[i].irqtype == type) && +		    (mp_irqs[i].srcbusirq == irq))  			break;  	}  	if (i < mp_irq_entries) {  		int apic;  		for(apic = 0; apic < nr_ioapics; apic++) { -			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) +			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)  				return apic;  		}  	} @@ -1012,23 +1002,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)  		return -1;  	}  	for (i = 0; i < mp_irq_entries; i++) { -		int lbus = mp_irqs[i].mp_srcbus; +		int lbus = mp_irqs[i].srcbus;  		for (apic = 0; apic < 
nr_ioapics; apic++) -			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || -			    mp_irqs[i].mp_dstapic == MP_APIC_ALL) +			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || +			    mp_irqs[i].dstapic == MP_APIC_ALL)  				break;  		if (!test_bit(lbus, mp_bus_not_pci) && -		    !mp_irqs[i].mp_irqtype && +		    !mp_irqs[i].irqtype &&  		    (bus == lbus) && -		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { -			int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); +		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { +			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);  			if (!(apic || IO_APIC_IRQ(irq)))  				continue; -			if (pin == (mp_irqs[i].mp_srcbusirq & 3)) +			if (pin == (mp_irqs[i].srcbusirq & 3))  				return irq;  			/*  			 * Use the first all-but-pin matching entry as a @@ -1071,7 +1061,7 @@ static int EISA_ELCR(unsigned int irq)   * EISA conforming in the MP table, that means its trigger type must   * be read in from the ELCR */ -#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))  #define default_EISA_polarity(idx)	default_ISA_polarity(idx)  /* PCI interrupts are always polarity one level triggered, @@ -1088,13 +1078,13 @@ static int EISA_ELCR(unsigned int irq)  static int MPBIOS_polarity(int idx)  { -	int bus = mp_irqs[idx].mp_srcbus; +	int bus = mp_irqs[idx].srcbus;  	int polarity;  	/*  	 * Determine IRQ line polarity (high active or low active):  	 */ -	switch (mp_irqs[idx].mp_irqflag & 3) +	switch (mp_irqs[idx].irqflag & 3)  	{  		case 0: /* conforms, ie. bus-type dependent polarity */  			if (test_bit(bus, mp_bus_not_pci)) @@ -1130,13 +1120,13 @@ static int MPBIOS_polarity(int idx)  static int MPBIOS_trigger(int idx)  { -	int bus = mp_irqs[idx].mp_srcbus; +	int bus = mp_irqs[idx].srcbus;  	int trigger;  	/*  	 * Determine IRQ trigger mode (edge or level sensitive):  	 */ -	switch ((mp_irqs[idx].mp_irqflag>>2) & 3) +	switch ((mp_irqs[idx].irqflag>>2) & 3)  	{  		case 0: /* conforms, ie. bus-type dependent */  			if (test_bit(bus, mp_bus_not_pci)) @@ -1214,16 +1204,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);  static int pin_2_irq(int idx, int apic, int pin)  {  	int irq, i; -	int bus = mp_irqs[idx].mp_srcbus; +	int bus = mp_irqs[idx].srcbus;  	/*  	 * Debugging check, we are in big trouble if this message pops up!  	 
*/ -	if (mp_irqs[idx].mp_dstirq != pin) +	if (mp_irqs[idx].dstirq != pin)  		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");  	if (test_bit(bus, mp_bus_not_pci)) { -		irq = mp_irqs[idx].mp_srcbusirq; +		irq = mp_irqs[idx].srcbusirq;  	} else {  		/*  		 * PCI IRQs are mapped in order @@ -1315,7 +1305,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)  		int new_cpu;  		int vector, offset; -		vector_allocation_domain(cpu, tmp_mask); +		apic->vector_allocation_domain(cpu, tmp_mask);  		vector = current_vector;  		offset = current_offset; @@ -1485,10 +1475,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t  					      handle_edge_irq, "edge");  } -static int setup_ioapic_entry(int apic, int irq, -			      struct IO_APIC_route_entry *entry, -			      unsigned int destination, int trigger, -			      int polarity, int vector) +int setup_ioapic_entry(int apic_id, int irq, +		       struct IO_APIC_route_entry *entry, +		       unsigned int destination, int trigger, +		       int polarity, int vector)  {  	/*  	 * add it to the IO-APIC irq-routing table: @@ -1497,25 +1487,25 @@ static int setup_ioapic_entry(int apic, int irq,  #ifdef CONFIG_INTR_REMAP  	if (intr_remapping_enabled) { -		struct intel_iommu *iommu = map_ioapic_to_ir(apic); +		struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);  		struct irte irte;  		struct IR_IO_APIC_route_entry *ir_entry =  			(struct IR_IO_APIC_route_entry *) entry;  		int index;  		if (!iommu) -			panic("No mapping iommu for ioapic %d\n", apic); +			panic("No mapping iommu for ioapic %d\n", apic_id);  		index = alloc_irte(iommu, irq, 1);  		if (index < 0) -			panic("Failed to allocate IRTE for ioapic %d\n", apic); +			panic("Failed to allocate IRTE for ioapic %d\n", apic_id);  		memset(&irte, 0, sizeof(irte));  		irte.present = 1; -		irte.dst_mode = INT_DEST_MODE; +		irte.dst_mode = apic->irq_dest_mode;  		irte.trigger_mode = trigger; -		irte.dlvry_mode = INT_DELIVERY_MODE; +		irte.dlvry_mode = apic->irq_delivery_mode;  		irte.vector = vector;  		irte.dest_id = IRTE_DEST(destination); @@ -1528,8 +1518,8 @@ static int setup_ioapic_entry(int apic, int irq,  	} else  #endif  	{ -		entry->delivery_mode = INT_DELIVERY_MODE; -		entry->dest_mode = INT_DEST_MODE; +		entry->delivery_mode = apic->irq_delivery_mode; +		entry->dest_mode = apic->irq_dest_mode;  		entry->dest = destination;  	} @@ -1546,7 +1536,7 @@ static int setup_ioapic_entry(int apic, int irq,  	return 0;  } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, +static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,  			      int trigger, int polarity)  {  	struct irq_cfg *cfg; @@ -1558,22 +1548,22 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de  	cfg = desc->chip_data; -	if (assign_irq_vector(irq, cfg, TARGET_CPUS)) +	if (assign_irq_vector(irq, cfg, apic->target_cpus()))  		return; -	dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());  	apic_printk(APIC_VERBOSE,KERN_DEBUG  		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "  		    "IRQ %d Mode:%i Active:%i)\n", -		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, +		    apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,  		    irq, trigger, polarity); -	if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, +	if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, 
&entry,  			       dest, trigger, polarity, cfg->vector)) {  		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n", -		       mp_ioapics[apic].mp_apicid, pin); +		       mp_ioapics[apic_id].apicid, pin);  		__clear_irq_vector(irq, cfg);  		return;  	} @@ -1582,12 +1572,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de  	if (irq < NR_IRQS_LEGACY)  		disable_8259A_irq(irq); -	ioapic_write_entry(apic, pin, entry); +	ioapic_write_entry(apic_id, pin, entry);  }  static void __init setup_IO_APIC_irqs(void)  { -	int apic, pin, idx, irq; +	int apic_id, pin, idx, irq;  	int notcon = 0;  	struct irq_desc *desc;  	struct irq_cfg *cfg; @@ -1595,21 +1585,19 @@ static void __init setup_IO_APIC_irqs(void)  	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -	for (apic = 0; apic < nr_ioapics; apic++) { -		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { +	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { +		for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { -			idx = find_irq_entry(apic, pin, mp_INT); +			idx = find_irq_entry(apic_id, pin, mp_INT);  			if (idx == -1) {  				if (!notcon) {  					notcon = 1;  					apic_printk(APIC_VERBOSE,  						KERN_DEBUG " %d-%d", -						mp_ioapics[apic].mp_apicid, -						pin); +						mp_ioapics[apic_id].apicid, pin);  				} else  					apic_printk(APIC_VERBOSE, " %d-%d", -						mp_ioapics[apic].mp_apicid, -						pin); +						mp_ioapics[apic_id].apicid, pin);  				continue;  			}  			if (notcon) { @@ -1618,20 +1606,25 @@ static void __init setup_IO_APIC_irqs(void)  				notcon = 0;  			} -			irq = pin_2_irq(idx, apic, pin); -#ifdef CONFIG_X86_32 -			if (multi_timer_check(apic, irq)) +			irq = pin_2_irq(idx, apic_id, pin); + +			/* +			 * Skip the timer IRQ if there's a quirk handler +			 * installed and if it returns 1: +			 */ +			if (apic->multi_timer_check && +					apic->multi_timer_check(apic_id, irq))  				continue; -#endif +  			desc = irq_to_desc_alloc_cpu(irq, cpu);  			if (!desc) {  				printk(KERN_INFO "can not get irq_desc for %d\n", irq);  				continue;  			}  			cfg = desc->chip_data; -			add_pin_to_irq_cpu(cfg, cpu, apic, pin); +			add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); -			setup_IO_APIC_irq(apic, pin, irq, desc, +			setup_IO_APIC_irq(apic_id, pin, irq, desc,  					irq_trigger(idx), irq_polarity(idx));  		}  	} @@ -1644,7 +1637,7 @@ static void __init setup_IO_APIC_irqs(void)  /*   * Set up the timer pin, possibly with the 8259A-master behind.   */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, +static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,  					int vector)  {  	struct IO_APIC_route_entry entry; @@ -1660,10 +1653,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,  	 * We use logical delivery to get the timer IRQ  	 * to the first CPU.  	 
*/ -	entry.dest_mode = INT_DEST_MODE; -	entry.mask = 1;					/* mask IRQ now */ -	entry.dest = cpu_mask_to_apicid(TARGET_CPUS); -	entry.delivery_mode = INT_DELIVERY_MODE; +	entry.dest_mode = apic->irq_dest_mode; +	entry.mask = 0;			/* don't mask IRQ for edge */ +	entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); +	entry.delivery_mode = apic->irq_delivery_mode;  	entry.polarity = 0;  	entry.trigger = 0;  	entry.vector = vector; @@ -1677,7 +1670,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,  	/*  	 * Add it to the IO-APIC irq-routing table:  	 */ -	ioapic_write_entry(apic, pin, entry); +	ioapic_write_entry(apic_id, pin, entry);  } @@ -1699,7 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void)  	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);  	for (i = 0; i < nr_ioapics; i++)  		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", -		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); +		       mp_ioapics[i].apicid, nr_ioapic_registers[i]);  	/*  	 * We are a bit conservative about what we expect.  We have to @@ -1719,7 +1712,7 @@ __apicdebuginit(void) print_IO_APIC(void)  	spin_unlock_irqrestore(&ioapic_lock, flags);  	printk("\n"); -	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); +	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);  	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);  	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);  	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1980,13 +1973,6 @@ void __init enable_IO_APIC(void)  	int apic;  	unsigned long flags; -#ifdef CONFIG_X86_32 -	int i; -	if (!pirqs_enabled) -		for (i = 0; i < MAX_PIRQS; i++) -			pirq_entries[i] = -1; -#endif -  	/*  	 * The number of IO-APIC IRQ registers (== #pins):  	 */ @@ -2090,7 +2076,7 @@ static void __init setup_ioapic_ids_from_mpc(void)  {  	union IO_APIC_reg_00 reg_00;  	physid_mask_t phys_id_present_map; -	int apic; +	int apic_id;  	int i;  	unsigned char old_id;  	unsigned long flags; @@ -2109,26 +2095,26 @@ static void __init setup_ioapic_ids_from_mpc(void)  	 * This is broken; anything with a real cpu count has to  	 * circumvent this idiocy regardless.  	 */ -	phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); +	phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);  	/*  	 * Set the IOAPIC ID to the value stored in the MPC table.  	 */ -	for (apic = 0; apic < nr_ioapics; apic++) { +	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {  		/* Read the register 0 value */  		spin_lock_irqsave(&ioapic_lock, flags); -		reg_00.raw = io_apic_read(apic, 0); +		reg_00.raw = io_apic_read(apic_id, 0);  		spin_unlock_irqrestore(&ioapic_lock, flags); -		old_id = mp_ioapics[apic].mp_apicid; +		old_id = mp_ioapics[apic_id].apicid; -		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { +		if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {  			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", -				apic, mp_ioapics[apic].mp_apicid); +				apic_id, mp_ioapics[apic_id].apicid);  			printk(KERN_ERR "... fixing up to %d. 
(tell your hw vendor)\n",  				reg_00.bits.ID); -			mp_ioapics[apic].mp_apicid = reg_00.bits.ID; +			mp_ioapics[apic_id].apicid = reg_00.bits.ID;  		}  		/* @@ -2136,10 +2122,10 @@ static void __init setup_ioapic_ids_from_mpc(void)  		 * system must have a unique ID or we get lots of nice  		 * 'stuck on smp_invalidate_needed IPI wait' messages.  		 */ -		if (check_apicid_used(phys_id_present_map, -					mp_ioapics[apic].mp_apicid)) { +		if (apic->check_apicid_used(phys_id_present_map, +					mp_ioapics[apic_id].apicid)) {  			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", -				apic, mp_ioapics[apic].mp_apicid); +				apic_id, mp_ioapics[apic_id].apicid);  			for (i = 0; i < get_physical_broadcast(); i++)  				if (!physid_isset(i, phys_id_present_map))  					break; @@ -2148,13 +2134,13 @@ static void __init setup_ioapic_ids_from_mpc(void)  			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",  				i);  			physid_set(i, phys_id_present_map); -			mp_ioapics[apic].mp_apicid = i; +			mp_ioapics[apic_id].apicid = i;  		} else {  			physid_mask_t tmp; -			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); +			tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);  			apic_printk(APIC_VERBOSE, "Setting %d in the "  					"phys_id_present_map\n", -					mp_ioapics[apic].mp_apicid); +					mp_ioapics[apic_id].apicid);  			physids_or(phys_id_present_map, phys_id_present_map, tmp);  		} @@ -2163,11 +2149,11 @@ static void __init setup_ioapic_ids_from_mpc(void)  		 * We need to adjust the IRQ routing table  		 * if the ID changed.  		 */ -		if (old_id != mp_ioapics[apic].mp_apicid) +		if (old_id != mp_ioapics[apic_id].apicid)  			for (i = 0; i < mp_irq_entries; i++) -				if (mp_irqs[i].mp_dstapic == old_id) -					mp_irqs[i].mp_dstapic -						= mp_ioapics[apic].mp_apicid; +				if (mp_irqs[i].dstapic == old_id) +					mp_irqs[i].dstapic +						= mp_ioapics[apic_id].apicid;  		/*  		 * Read the right value from the MPC table and @@ -2175,20 +2161,20 @@ static void __init setup_ioapic_ids_from_mpc(void)  		 */  		apic_printk(APIC_VERBOSE, KERN_INFO  			"...changing IO-APIC physical APIC ID to %d ...", -			mp_ioapics[apic].mp_apicid); +			mp_ioapics[apic_id].apicid); -		reg_00.bits.ID = mp_ioapics[apic].mp_apicid; +		reg_00.bits.ID = mp_ioapics[apic_id].apicid;  		spin_lock_irqsave(&ioapic_lock, flags); -		io_apic_write(apic, 0, reg_00.raw); +		io_apic_write(apic_id, 0, reg_00.raw);  		spin_unlock_irqrestore(&ioapic_lock, flags);  		/*  		 * Sanity check  		 */  		spin_lock_irqsave(&ioapic_lock, flags); -		reg_00.raw = io_apic_read(apic, 0); +		reg_00.raw = io_apic_read(apic_id, 0);  		spin_unlock_irqrestore(&ioapic_lock, flags); -		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) +		if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)  			printk("could not set ID!\n");  		else  			apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2291,7 +2277,7 @@ static int ioapic_retrigger_irq(unsigned int irq)  	unsigned long flags;  	spin_lock_irqsave(&vector_lock, flags); -	send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); +	apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);  	spin_unlock_irqrestore(&vector_lock, flags);  	return 1; @@ -2299,7 +2285,7 @@ static int ioapic_retrigger_irq(unsigned int irq)  #else  static int ioapic_retrigger_irq(unsigned int irq)  { -	send_IPI_self(irq_cfg(irq)->vector); +	apic->send_IPI_self(irq_cfg(irq)->vector);  	return 1;  } @@ -2363,7 +2349,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct 
cpumask *mask)  	set_extra_move_desc(desc, mask); -	dest = cpu_mask_to_apicid_and(cfg->domain, mask); +	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);  	modify_ioapic_rte = desc->status & IRQ_LEVEL;  	if (modify_ioapic_rte) { @@ -2383,7 +2369,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)  	if (cfg->move_in_progress)  		send_cleanup_vector(cfg); -	cpumask_copy(&desc->affinity, mask); +	cpumask_copy(desc->affinity, mask);  }  static int migrate_irq_remapped_level_desc(struct irq_desc *desc) @@ -2405,11 +2391,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)  	}  	/* everthing is clear. we have right of way */ -	migrate_ioapic_irq_desc(desc, &desc->pending_mask); +	migrate_ioapic_irq_desc(desc, desc->pending_mask);  	ret = 0;  	desc->status &= ~IRQ_MOVE_PENDING; -	cpumask_clear(&desc->pending_mask); +	cpumask_clear(desc->pending_mask);  unmask:  	unmask_IO_APIC_irq_desc(desc); @@ -2434,7 +2420,7 @@ static void ir_irq_migration(struct work_struct *work)  				continue;  			} -			desc->chip->set_affinity(irq, &desc->pending_mask); +			desc->chip->set_affinity(irq, desc->pending_mask);  			spin_unlock_irqrestore(&desc->lock, flags);  		}  	} @@ -2448,7 +2434,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,  {  	if (desc->status & IRQ_LEVEL) {  		desc->status |= IRQ_MOVE_PENDING; -		cpumask_copy(&desc->pending_mask, mask); +		cpumask_copy(desc->pending_mask, mask);  		migrate_irq_remapped_level_desc(desc);  		return;  	} @@ -2516,7 +2502,7 @@ static void irq_complete_move(struct irq_desc **descp)  		/* domain has not changed, but affinity did */  		me = smp_processor_id(); -		if (cpu_isset(me, desc->affinity)) { +		if (cpumask_test_cpu(me, desc->affinity)) {  			*descp = desc = move_irq_desc(desc, me);  			/* get the new one */  			cfg = desc->chip_data; @@ -2867,19 +2853,15 @@ static inline void __init check_timer(void)  	int cpu = boot_cpu_id;  	int apic1, pin1, apic2, pin2;  	unsigned long flags; -	unsigned int ver;  	int no_pin1 = 0;  	local_irq_save(flags); -	ver = apic_read(APIC_LVR); -	ver = GET_APIC_VERSION(ver); -  	/*  	 * get/set the timer IRQ vector:  	 */  	disable_8259A_irq(0); -	assign_irq_vector(0, cfg, TARGET_CPUS); +	assign_irq_vector(0, cfg, apic->target_cpus());  	/*  	 * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2893,7 +2875,13 @@ static inline void __init check_timer(void)  	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);  	init_8259A(1);  #ifdef CONFIG_X86_32 -	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +	{ +		unsigned int ver; + +		ver = apic_read(APIC_LVR); +		ver = GET_APIC_VERSION(ver); +		timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +	}  #endif  	pin1  = find_isa_irq_pin(0, mp_INT); @@ -2932,8 +2920,17 @@ static inline void __init check_timer(void)  		if (no_pin1) {  			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);  			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); +		} else { +			/* for edge trigger, setup_IO_APIC_irq already +			 * leave it unmasked. +			 * so only need to unmask if it is level-trigger +			 * do we really have level trigger timer? 
+			 */ +			int idx; +			idx = find_irq_entry(apic1, pin1, mp_INT); +			if (idx != -1 && irq_trigger(idx)) +				unmask_IO_APIC_irq_desc(desc);  		} -		unmask_IO_APIC_irq_desc(desc);  		if (timer_irq_works()) {  			if (nmi_watchdog == NMI_IO_APIC) {  				setup_nmi(); @@ -2947,6 +2944,7 @@ static inline void __init check_timer(void)  		if (intr_remapping_enabled)  			panic("timer doesn't work through Interrupt-remapped IO-APIC");  #endif +		local_irq_disable();  		clear_IO_APIC_pin(apic1, pin1);  		if (!no_pin1)  			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2961,7 +2959,6 @@ static inline void __init check_timer(void)  		 */  		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);  		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); -		unmask_IO_APIC_irq_desc(desc);  		enable_8259A_irq(0);  		if (timer_irq_works()) {  			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2976,6 +2973,7 @@ static inline void __init check_timer(void)  		/*  		 * Cleanup, just in case ...  		 */ +		local_irq_disable();  		disable_8259A_irq(0);  		clear_IO_APIC_pin(apic2, pin2);  		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); @@ -3001,6 +2999,7 @@ static inline void __init check_timer(void)  		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");  		goto out;  	} +	local_irq_disable();  	disable_8259A_irq(0);  	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);  	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -3018,6 +3017,7 @@ static inline void __init check_timer(void)  		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");  		goto out;  	} +	local_irq_disable();  	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");  	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "  		"report.  Then try booting with the 'noapic' option.\n"); @@ -3047,13 +3047,9 @@ out:  void __init setup_IO_APIC(void)  { -#ifdef CONFIG_X86_32 -	enable_IO_APIC(); -#else  	/*  	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP  	 */ -#endif  	io_apic_irqs = ~PIC_IRQS; @@ -3118,8 +3114,8 @@ static int ioapic_resume(struct sys_device *dev)  	spin_lock_irqsave(&ioapic_lock, flags);  	reg_00.raw = io_apic_read(dev->id, 0); -	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { -		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; +	if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { +		reg_00.bits.ID = mp_ioapics[dev->id].apicid;  		io_apic_write(dev->id, 0, reg_00.raw);  	}  	spin_unlock_irqrestore(&ioapic_lock, flags); @@ -3169,6 +3165,7 @@ static int __init ioapic_init_sysfs(void)  device_initcall(ioapic_init_sysfs); +static int nr_irqs_gsi = NR_IRQS_LEGACY;  /*   * Dynamic irq allocate and deallocation   */ @@ -3183,11 +3180,11 @@ unsigned int create_irq_nr(unsigned int irq_want)  	struct irq_desc *desc_new = NULL;  	irq = 0; -	spin_lock_irqsave(&vector_lock, flags); -	for (new = irq_want; new < NR_IRQS; new++) { -		if (platform_legacy_irq(new)) -			continue; +	if (irq_want < nr_irqs_gsi) +		irq_want = nr_irqs_gsi; +	spin_lock_irqsave(&vector_lock, flags); +	for (new = irq_want; new < nr_irqs; new++) {  		desc_new = irq_to_desc_alloc_cpu(new, cpu);  		if (!desc_new) {  			printk(KERN_INFO "can not get irq_desc for %d\n", new); @@ -3197,7 +3194,7 @@ unsigned int create_irq_nr(unsigned int irq_want)  		if (cfg_new->vector != 0)  			continue; -		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) +		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)  			irq = new;  		break;  	} @@ -3212,7 +3209,6 @@ unsigned int 
create_irq_nr(unsigned int irq_want)  	return irq;  } -static int nr_irqs_gsi = NR_IRQS_LEGACY;  int create_irq(void)  {  	unsigned int irq_want; @@ -3259,12 +3255,15 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  	int err;  	unsigned dest; +	if (disable_apic) +		return -ENXIO; +  	cfg = irq_cfg(irq); -	err = assign_irq_vector(irq, cfg, TARGET_CPUS); +	err = assign_irq_vector(irq, cfg, apic->target_cpus());  	if (err)  		return err; -	dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());  #ifdef CONFIG_INTR_REMAP  	if (irq_remapped(irq)) { @@ -3278,9 +3277,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  		memset (&irte, 0, sizeof(irte));  		irte.present = 1; -		irte.dst_mode = INT_DEST_MODE; +		irte.dst_mode = apic->irq_dest_mode;  		irte.trigger_mode = 0; /* edge */ -		irte.dlvry_mode = INT_DELIVERY_MODE; +		irte.dlvry_mode = apic->irq_delivery_mode;  		irte.vector = cfg->vector;  		irte.dest_id = IRTE_DEST(dest); @@ -3298,10 +3297,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  		msg->address_hi = MSI_ADDR_BASE_HI;  		msg->address_lo =  			MSI_ADDR_BASE_LO | -			((INT_DEST_MODE == 0) ? +			((apic->irq_dest_mode == 0) ?  				MSI_ADDR_DEST_MODE_PHYSICAL:  				MSI_ADDR_DEST_MODE_LOGICAL) | -			((INT_DELIVERY_MODE != dest_LowestPrio) ? +			((apic->irq_delivery_mode != dest_LowestPrio) ?  				MSI_ADDR_REDIRECTION_CPU:  				MSI_ADDR_REDIRECTION_LOWPRI) |  			MSI_ADDR_DEST_ID(dest); @@ -3309,7 +3308,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  		msg->data =  			MSI_DATA_TRIGGER_EDGE |  			MSI_DATA_LEVEL_ASSERT | -			((INT_DELIVERY_MODE != dest_LowestPrio) ? +			((apic->irq_delivery_mode != dest_LowestPrio) ?  				
MSI_DATA_DELIVERY_FIXED:  				MSI_DATA_DELIVERY_LOWPRI) |  			MSI_DATA_VECTOR(cfg->vector); @@ -3464,40 +3463,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)  	return 0;  } -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) -{ -	unsigned int irq; -	int ret; -	unsigned int irq_want; - -	irq_want = nr_irqs_gsi; -	irq = create_irq_nr(irq_want); -	if (irq == 0) -		return -1; - -#ifdef CONFIG_INTR_REMAP -	if (!intr_remapping_enabled) -		goto no_ir; - -	ret = msi_alloc_irte(dev, irq, 1); -	if (ret < 0) -		goto error; -no_ir: -#endif -	ret = setup_msi_irq(dev, msidesc, irq); -	if (ret < 0) { -		destroy_irq(irq); -		return ret; -	} -	return 0; - -#ifdef CONFIG_INTR_REMAP -error: -	destroy_irq(irq); -	return ret; -#endif -} -  int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  {  	unsigned int irq; @@ -3514,9 +3479,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  	sub_handle = 0;  	list_for_each_entry(msidesc, &dev->msi_list, list) {  		irq = create_irq_nr(irq_want); -		irq_want++;  		if (irq == 0)  			return -1; +		irq_want = irq + 1;  #ifdef CONFIG_INTR_REMAP  		if (!intr_remapping_enabled)  			goto no_ir; @@ -3727,13 +3692,17 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  	struct irq_cfg *cfg;  	int err; +	if (disable_apic) +		return -ENXIO; +  	cfg = irq_cfg(irq); -	err = assign_irq_vector(irq, cfg, TARGET_CPUS); +	err = assign_irq_vector(irq, cfg, apic->target_cpus());  	if (!err) {  		struct ht_irq_msg msg;  		unsigned dest; -		dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +		dest = apic->cpu_mask_to_apicid_and(cfg->domain, +						    apic->target_cpus());  		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3741,11 +3710,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  			HT_IRQ_LOW_BASE |  			HT_IRQ_LOW_DEST_ID(dest) |  			HT_IRQ_LOW_VECTOR(cfg->vector) | -			((INT_DEST_MODE == 0) ? +			((apic->irq_dest_mode == 0) ?  				HT_IRQ_LOW_DM_PHYSICAL :  				HT_IRQ_LOW_DM_LOGICAL) |  			HT_IRQ_LOW_RQEOI_EDGE | -			((INT_DELIVERY_MODE != dest_LowestPrio) ? +			((apic->irq_delivery_mode != dest_LowestPrio) ?  				HT_IRQ_LOW_MT_FIXED :  				HT_IRQ_LOW_MT_ARBITRATED) |  			HT_IRQ_LOW_IRQ_MASKED; @@ -3761,7 +3730,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  }  #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV  /*   * Re-target the irq to the specified CPU and enable the specified MMR located   * on the specified blade to allow the sending of MSIs to the specified CPU. 
@@ -3793,12 +3762,12 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));  	entry->vector = cfg->vector; -	entry->delivery_mode = INT_DELIVERY_MODE; -	entry->dest_mode = INT_DEST_MODE; +	entry->delivery_mode = apic->irq_delivery_mode; +	entry->dest_mode = apic->irq_dest_mode;  	entry->polarity = 0;  	entry->trigger = 0;  	entry->mask = 0; -	entry->dest = cpu_mask_to_apicid(eligible_cpu); +	entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);  	mmr_pnode = uv_blade_to_pnode(mmr_blade);  	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3861,6 +3830,28 @@ void __init probe_nr_irqs_gsi(void)  	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);  } +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ +	int nr; + +	if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) +		nr_irqs = NR_VECTORS * nr_cpu_ids; + +	nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) +	/* +	 * for MSI and HT dyn irq +	 */ +	nr += nr_irqs_gsi * 16; +#endif +	if (nr < nr_irqs) +		nr_irqs = nr; + +	return 0; +} +#endif +  /* --------------------------------------------------------------------------                            ACPI-based IOAPIC Configuration     -------------------------------------------------------------------------- */ @@ -3886,7 +3877,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)  	 */  	if (physids_empty(apic_id_map)) -		apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); +		apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);  	spin_lock_irqsave(&ioapic_lock, flags);  	reg_00.raw = io_apic_read(ioapic, 0); @@ -3902,10 +3893,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)  	 * Every APIC in a system must have a unique ID or we get lots of nice  	 * 'stuck on smp_invalidate_needed IPI wait' messages.  	 
*/ -	if (check_apicid_used(apic_id_map, apic_id)) { +	if (apic->check_apicid_used(apic_id_map, apic_id)) {  		for (i = 0; i < get_physical_broadcast(); i++) { -			if (!check_apicid_used(apic_id_map, i)) +			if (!apic->check_apicid_used(apic_id_map, i))  				break;  		} @@ -3918,7 +3909,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)  		apic_id = i;  	} -	tmp = apicid_to_cpu_present(apic_id); +	tmp = apic->apicid_to_cpu_present(apic_id);  	physids_or(apic_id_map, apic_id_map, tmp);  	if (reg_00.bits.ID != apic_id) { @@ -3995,8 +3986,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)  		return -1;  	for (i = 0; i < mp_irq_entries; i++) -		if (mp_irqs[i].mp_irqtype == mp_INT && -		    mp_irqs[i].mp_srcbusirq == bus_irq) +		if (mp_irqs[i].irqtype == mp_INT && +		    mp_irqs[i].srcbusirq == bus_irq)  			break;  	if (i >= mp_irq_entries)  		return -1; @@ -4011,7 +4002,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)  /*   * This function currently is only a helper for the i386 smp boot process where   * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS + * so mask in all cases should simply be apic->target_cpus()   */  #ifdef CONFIG_SMP  void __init setup_ioapic_dest(void) @@ -4050,9 +4041,9 @@ void __init setup_ioapic_dest(void)  			 */  			if (desc->status &  			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) -				mask = &desc->affinity; +				mask = desc->affinity;  			else -				mask = TARGET_CPUS; +				mask = apic->target_cpus();  #ifdef CONFIG_INTR_REMAP  			if (intr_remapping_enabled) @@ -4111,7 +4102,7 @@ void __init ioapic_init_mappings(void)  	ioapic_res = ioapic_setup_resources();  	for (i = 0; i < nr_ioapics; i++) {  		if (smp_found_config) { -			ioapic_phys = mp_ioapics[i].mp_apicaddr; +			ioapic_phys = mp_ioapics[i].apicaddr;  #ifdef CONFIG_X86_32  			if (!ioapic_phys) {  				printk(KERN_ERR diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c new file mode 100644 index 00000000000..dbf5445727a --- /dev/null +++ b/arch/x86/kernel/apic/ipi.c @@ -0,0 +1,164 @@ +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/apic.h> +#include <asm/proto.h> +#include <asm/ipi.h> + +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) +{ +	unsigned long query_cpu; +	unsigned long flags; + +	/* +	 * Hack. The clustered APIC addressing mode doesn't allow us to send +	 * to an arbitrary mask, so I do a unicast to each CPU instead. 
+	 * - mbligh +	 */ +	local_irq_save(flags); +	for_each_cpu(query_cpu, mask) { +		__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, +				query_cpu), vector, APIC_DEST_PHYSICAL); +	} +	local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, +						 int vector) +{ +	unsigned int this_cpu = smp_processor_id(); +	unsigned int query_cpu; +	unsigned long flags; + +	/* See Hack comment above */ + +	local_irq_save(flags); +	for_each_cpu(query_cpu, mask) { +		if (query_cpu == this_cpu) +			continue; +		__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, +				 query_cpu), vector, APIC_DEST_PHYSICAL); +	} +	local_irq_restore(flags); +} + +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, +						 int vector) +{ +	unsigned long flags; +	unsigned int query_cpu; + +	/* +	 * Hack. The clustered APIC addressing mode doesn't allow us to send +	 * to an arbitrary mask, so I do a unicasts to each CPU instead. This +	 * should be modified to do 1 message per cluster ID - mbligh +	 */ + +	local_irq_save(flags); +	for_each_cpu(query_cpu, mask) +		__default_send_IPI_dest_field( +			apic->cpu_to_logical_apicid(query_cpu), vector, +			apic->dest_logical); +	local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, +						 int vector) +{ +	unsigned long flags; +	unsigned int query_cpu; +	unsigned int this_cpu = smp_processor_id(); + +	/* See Hack comment above */ + +	local_irq_save(flags); +	for_each_cpu(query_cpu, mask) { +		if (query_cpu == this_cpu) +			continue; +		__default_send_IPI_dest_field( +			apic->cpu_to_logical_apicid(query_cpu), vector, +			apic->dest_logical); +		} +	local_irq_restore(flags); +} + +#ifdef CONFIG_X86_32 + +/* + * This is only used on smaller machines. + */ +void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) +{ +	unsigned long mask = cpumask_bits(cpumask)[0]; +	unsigned long flags; + +	local_irq_save(flags); +	WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); +	__default_send_IPI_dest_field(mask, vector, apic->dest_logical); +	local_irq_restore(flags); +} + +void default_send_IPI_allbutself(int vector) +{ +	/* +	 * if there are no other CPUs in the system then we get an APIC send +	 * error if we try to broadcast, thus avoid sending IPIs in this case. +	 */ +	if (!(num_online_cpus() > 1)) +		return; + +	__default_local_send_IPI_allbutself(vector); +} + +void default_send_IPI_all(int vector) +{ +	__default_local_send_IPI_all(vector); +} + +void default_send_IPI_self(int vector) +{ +	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical); +} + +/* must come after the send_IPI functions above for inlining */ +static int convert_apicid_to_cpu(int apic_id) +{ +	int i; + +	for_each_possible_cpu(i) { +		if (per_cpu(x86_cpu_to_apicid, i) == apic_id) +			return i; +	} +	return -1; +} + +int safe_smp_processor_id(void) +{ +	int apicid, cpuid; + +	if (!boot_cpu_has(X86_FEATURE_APIC)) +		return 0; + +	apicid = hard_smp_processor_id(); +	if (apicid == BAD_APICID) +		return 0; + +	cpuid = convert_apicid_to_cpu(apicid); + +	return cpuid >= 0 ? 
cpuid : 0; +} +#endif diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/apic/nmi.c index 7228979f1e7..bdfad80c3cf 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -34,7 +34,7 @@  #include <asm/mce.h> -#include <mach_traps.h> +#include <asm/mach_traps.h>  int unknown_nmi_panic;  int nmi_watchdog_enabled; @@ -61,11 +61,7 @@ static int endflag __initdata;  static inline unsigned int get_nmi_count(int cpu)  { -#ifdef CONFIG_X86_64 -	return cpu_pda(cpu)->__nmi_count; -#else -	return nmi_count(cpu); -#endif +	return per_cpu(irq_stat, cpu).__nmi_count;  }  static inline int mce_in_progress(void) @@ -82,12 +78,8 @@ static inline int mce_in_progress(void)   */  static inline unsigned int get_timer_irqs(int cpu)  { -#ifdef CONFIG_X86_64 -	return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); -#else  	return per_cpu(irq_stat, cpu).apic_timer_irqs +  		per_cpu(irq_stat, cpu).irq0_irqs; -#endif  }  #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c new file mode 100644 index 00000000000..ba2fc646553 --- /dev/null +++ b/arch/x86/kernel/apic/numaq_32.c @@ -0,0 +1,557 @@ +/* + * Written by: Patricia Gaughen, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <gone@us.ibm.com> + */ +#include <linux/nodemask.h> +#include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/numa.h> +#include <linux/smp.h> +#include <linux/io.h> +#include <linux/mm.h> + +#include <asm/processor.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/numaq.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/ipi.h> + +#define	MB_TO_PAGES(addr)		((addr) << (20 - PAGE_SHIFT)) + +int found_numaq; + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... 
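+ *
+ * Each MP_TRANSLATION entry found by smp_read_mpc_oem() below is stashed
+ * in translation_table[], and the mpc_apic_id()/mpc_oem_bus_info() quirks
+ * then index it with the running mpc_record counter.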
+ */ +struct mpc_trans { +	unsigned char			mpc_type; +	unsigned char			trans_len; +	unsigned char			trans_type; +	unsigned char			trans_quad; +	unsigned char			trans_global; +	unsigned char			trans_local; +	unsigned short			trans_reserved; +}; + +/* x86_quirks member */ +static int				mpc_record; + +static struct mpc_trans			*translation_table[MAX_MPC_ENTRY]; + +int					mp_bus_id_to_node[MAX_MP_BUSSES]; +int					mp_bus_id_to_local[MAX_MP_BUSSES]; +int					quad_local_to_mp_bus_id[NR_CPUS/4][4]; + + +static inline void numaq_register_node(int node, struct sys_cfg_data *scd) +{ +	struct eachquadmem *eq = scd->eq + node; + +	node_set_online(node); + +	/* Convert to pages */ +	node_start_pfn[node] = +		 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); + +	node_end_pfn[node] = +		 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); + +	e820_register_active_regions(node, node_start_pfn[node], +						node_end_pfn[node]); + +	memory_present(node, node_start_pfn[node], node_end_pfn[node]); + +	node_remap_size[node] = node_memmap_size_bytes(node, +					node_start_pfn[node], +					node_end_pfn[node]); +} + +/* + * Function: smp_dump_qct() + * + * Description: gets memory layout from the quad config table.  This + * function also updates node_online_map with the nodes (quads) present. + */ +static void __init smp_dump_qct(void) +{ +	struct sys_cfg_data *scd; +	int node; + +	scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); + +	nodes_clear(node_online_map); +	for_each_node(node) { +		if (scd->quads_present31_0 & (1 << node)) +			numaq_register_node(node, scd); +	} +} + +void __cpuinit numaq_tsc_disable(void) +{ +	if (!found_numaq) +		return; + +	if (num_online_nodes() > 1) { +		printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); +		setup_clear_cpu_cap(X86_FEATURE_TSC); +	} +} + +static int __init numaq_pre_time_init(void) +{ +	numaq_tsc_disable(); +	return 0; +} + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ +	return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); +} + +/* x86_quirks member */ +static int mpc_apic_id(struct mpc_cpu *m) +{ +	int quad = translation_table[mpc_record]->trans_quad; +	int logical_apicid = generate_logical_apicid(quad, m->apicid); + +	printk(KERN_DEBUG +		"Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", +		 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, +		(m->cpufeature & CPU_MODEL_MASK) >> 4, +		 m->apicver, quad, logical_apicid); + +	return logical_apicid; +} + +/* x86_quirks member */ +static void mpc_oem_bus_info(struct mpc_bus *m, char *name) +{ +	int quad = translation_table[mpc_record]->trans_quad; +	int local = translation_table[mpc_record]->trans_local; + +	mp_bus_id_to_node[m->busid] = quad; +	mp_bus_id_to_local[m->busid] = local; + +	printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad); +} + +/* x86_quirks member */ +static void mpc_oem_pci_bus(struct mpc_bus *m) +{ +	int quad = translation_table[mpc_record]->trans_quad; +	int local = translation_table[mpc_record]->trans_local; + +	quad_local_to_mp_bus_id[quad][local] = m->busid; +} + +static void __init MP_translation_info(struct mpc_trans *m) +{ +	printk(KERN_INFO +	    "Translation: record %d, type %d, quad %d, global %d, local %d\n", +	       mpc_record, m->trans_type, m->trans_quad, m->trans_global, +	       m->trans_local); + +	if (mpc_record >= MAX_MPC_ENTRY) +		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); +	else +		translation_table[mpc_record] = m; /* stash this for later */ + +	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) +		node_set_online(m->trans_quad); +} + +static int __init mpf_checksum(unsigned char *mp, int len) +{ +	int sum = 0; + +	while (len--) +		sum += *mp++; + +	return sum & 0xFF; +} + +/* + * Read/parse the MPC oem tables + */ +static void __init + smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +{ +	int count = sizeof(*oemtable);	/* the header size */ +	unsigned char *oemptr = ((unsigned char *)oemtable) + count; + +	mpc_record = 0; +	printk(KERN_INFO +		"Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + +	if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { +		printk(KERN_WARNING +		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", +		       oemtable->signature[0], oemtable->signature[1], +		       oemtable->signature[2], oemtable->signature[3]); +		return; +	} + +	if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { +		printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); +		return; +	} + +	while (count < oemtable->length) { +		switch (*oemptr) { +		case MP_TRANSLATION: +			{ +				struct mpc_trans *m = (void *)oemptr; + +				MP_translation_info(m); +				oemptr += sizeof(*m); +				count += sizeof(*m); +				++mpc_record; +				break; +			} +		default: +			printk(KERN_WARNING +			       "Unrecognised OEM table entry type! 
- %d\n", +			       (int)*oemptr); +			return; +		} +	} +} + +static int __init numaq_setup_ioapic_ids(void) +{ +	/* so can skip it */ +	return 1; +} + +static struct x86_quirks numaq_x86_quirks __initdata = { +	.arch_pre_time_init		= numaq_pre_time_init, +	.arch_time_init			= NULL, +	.arch_pre_intr_init		= NULL, +	.arch_memory_setup		= NULL, +	.arch_intr_init			= NULL, +	.arch_trap_init			= NULL, +	.mach_get_smp_config		= NULL, +	.mach_find_smp_config		= NULL, +	.mpc_record			= &mpc_record, +	.mpc_apic_id			= mpc_apic_id, +	.mpc_oem_bus_info		= mpc_oem_bus_info, +	.mpc_oem_pci_bus		= mpc_oem_pci_bus, +	.smp_read_mpc_oem		= smp_read_mpc_oem, +	.setup_ioapic_ids		= numaq_setup_ioapic_ids, +}; + +static __init void early_check_numaq(void) +{ +	/* +	 * Find possible boot-time SMP configuration: +	 */ +	early_find_smp_config(); + +	/* +	 * get boot-time SMP configuration: +	 */ +	if (smp_found_config) +		early_get_smp_config(); + +	if (found_numaq) +		x86_quirks = &numaq_x86_quirks; +} + +int __init get_memcfg_numaq(void) +{ +	early_check_numaq(); +	if (!found_numaq) +		return 0; +	smp_dump_qct(); + +	return 1; +} + +#define NUMAQ_APIC_DFR_VALUE	(APIC_DFR_CLUSTER) + +static inline unsigned int numaq_get_apic_id(unsigned long x) +{ +	return (x >> 24) & 0x0F; +} + +static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) +{ +	default_send_IPI_mask_sequence_logical(mask, vector); +} + +static inline void numaq_send_IPI_allbutself(int vector) +{ +	default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); +} + +static inline void numaq_send_IPI_all(int vector) +{ +	numaq_send_IPI_mask(cpu_online_mask, vector); +} + +#define NUMAQ_TRAMPOLINE_PHYS_LOW	(0x8) +#define NUMAQ_TRAMPOLINE_PHYS_HIGH	(0xa) + +/* + * Because we use NMIs rather than the INIT-STARTUP sequence to + * bootstrap the CPUs, the APIC may be in a weird state. Kick it: + */ +static inline void numaq_smp_callin_clear_local_apic(void) +{ +	clear_local_APIC(); +} + +static inline const cpumask_t *numaq_target_cpus(void) +{ +	return &CPU_MASK_ALL; +} + +static inline unsigned long +numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +{ +	return physid_isset(apicid, bitmap); +} + +static inline unsigned long numaq_check_apicid_present(int bit) +{ +	return physid_isset(bit, phys_cpu_present_map); +} + +static inline int numaq_apic_id_registered(void) +{ +	return 1; +} + +static inline void numaq_init_apic_ldr(void) +{ +	/* Already done in NUMA-Q firmware */ +} + +static inline void numaq_setup_apic_routing(void) +{ +	printk(KERN_INFO +		"Enabling APIC mode:  NUMA-Q.  Using %d I/O APICs\n", +		nr_ioapics); +} + +/* + * Skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum. + */ +static inline int numaq_multi_timer_check(int apic, int irq) +{ +	return apic != 0 && irq == 0; +} + +static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +{ +	/* We don't have a good way to do this yet - hack */ +	return physids_promote(0xFUL); +} + +static inline int numaq_cpu_to_logical_apicid(int cpu) +{ +	if (cpu >= nr_cpu_ids) +		return BAD_APICID; +	return cpu_2_logical_apicid[cpu]; +} + +/* + * Supporting over 60 cpus on NUMA-Q requires a locality-dependent + * cpu to APIC ID relation to properly interact with the intelligent + * mode of the cluster controller. 
+ */
+static inline int numaq_cpu_present_to_apicid(int mps_cpu)
+{
+	if (mps_cpu < 60)
+		return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+	else
+		return BAD_APICID;
+}
+
+static inline int numaq_apicid_to_node(int logical_apicid)
+{
+	return logical_apicid >> 4;
+}
+
+static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
+{
+	int node = numaq_apicid_to_node(logical_apicid);
+	int cpu = __ffs(logical_apicid & 0xf);
+
+	return physid_mask_of_physid(cpu + 4*node);
+}
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+	return 1;
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+	return 0x0F;
+}
+
+static inline unsigned int
+numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+			     const struct cpumask *andmask)
+{
+	return 0x0F;
+}
+
+/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
+static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+	return cpuid_apic >> index_msb;
+}
+
+static int
+numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+	if (strncmp(oem, "IBM NUMA", 8))
+		printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+
+	return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+	/* already know from get_memcfg_numaq() */
+	return found_numaq;
+}
+
+static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+	/* Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+	 */ +	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; +} + +static void numaq_setup_portio_remap(void) +{ +	int num_quads = num_online_nodes(); + +	if (num_quads <= 1) +		return; + +	printk(KERN_INFO +		"Remapping cross-quad port I/O for %d quads\n", num_quads); + +	xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); + +	printk(KERN_INFO +		"xquad_portio vaddr 0x%08lx, len %08lx\n", +		(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); +} + +struct apic apic_numaq = { + +	.name				= "NUMAQ", +	.probe				= probe_numaq, +	.acpi_madt_oem_check		= NULL, +	.apic_id_registered		= numaq_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	/* physical delivery on LOCAL quad: */ +	.irq_dest_mode			= 0, + +	.target_cpus			= numaq_target_cpus, +	.disable_esr			= 1, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= numaq_check_apicid_used, +	.check_apicid_present		= numaq_check_apicid_present, + +	.vector_allocation_domain	= numaq_vector_allocation_domain, +	.init_apic_ldr			= numaq_init_apic_ldr, + +	.ioapic_phys_id_map		= numaq_ioapic_phys_id_map, +	.setup_apic_routing		= numaq_setup_apic_routing, +	.multi_timer_check		= numaq_multi_timer_check, +	.apicid_to_node			= numaq_apicid_to_node, +	.cpu_to_logical_apicid		= numaq_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= numaq_cpu_present_to_apicid, +	.apicid_to_cpu_present		= numaq_apicid_to_cpu_present, +	.setup_portio_remap		= numaq_setup_portio_remap, +	.check_phys_apicid_present	= numaq_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= numaq_phys_pkg_id, +	.mps_oem_check			= numaq_mps_oem_check, + +	.get_apic_id			= numaq_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0x0F << 24, + +	.cpu_mask_to_apicid		= numaq_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= numaq_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= numaq_send_IPI_mask, +	.send_IPI_mask_allbutself	= NULL, +	.send_IPI_allbutself		= numaq_send_IPI_allbutself, +	.send_IPI_all			= numaq_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_nmi, +	.trampoline_phys_low		= NUMAQ_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= NUMAQ_TRAMPOLINE_PHYS_HIGH, + +	/* We don't do anything here because we use NMI's to boot instead */ +	.wait_for_init_deassert		= NULL, + +	.smp_callin_clear_local_apic	= numaq_smp_callin_clear_local_apic, +	.inquire_remote_apic		= NULL, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c new file mode 100644 index 00000000000..141c99a1c26 --- /dev/null +++ b/arch/x86/kernel/apic/probe_32.c @@ -0,0 +1,284 @@ +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. 
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <asm/ipi.h>
+
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI	(1)
+#else
+#define DEFAULT_SEND_IPI	(0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+static __init int no_ipi_broadcast(char *str)
+{
+	get_option(&str, &no_broadcast);
+	pr_info("Using %s mode\n",
+		no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
+	return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+	pr_info("Using IPI %s mode\n",
+		no_broadcast ? "No-Shortcut" : "Shortcut");
+	return 0;
+}
+late_initcall(print_ipi_mode);
+
+void default_setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+	printk(KERN_INFO
+		"Enabling APIC mode:  Flat.  Using %d I/O APICs\n",
+		nr_ioapics);
+#endif
+}
+
+static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+	/*
+	 * Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+	 */
+	*retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
+}
+
+/* should be called last.
*/ +static int probe_default(void) +{ +	return 1; +} + +struct apic apic_default = { + +	.name				= "default", +	.probe				= probe_default, +	.acpi_madt_oem_check		= NULL, +	.apic_id_registered		= default_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	/* logical delivery broadcast to all CPUs: */ +	.irq_dest_mode			= 1, + +	.target_cpus			= default_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= default_check_apicid_used, +	.check_apicid_present		= default_check_apicid_present, + +	.vector_allocation_domain	= default_vector_allocation_domain, +	.init_apic_ldr			= default_init_apic_ldr, + +	.ioapic_phys_id_map		= default_ioapic_phys_id_map, +	.setup_apic_routing		= default_setup_apic_routing, +	.multi_timer_check		= NULL, +	.apicid_to_node			= default_apicid_to_node, +	.cpu_to_logical_apicid		= default_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= default_apicid_to_cpu_present, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= default_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= default_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0x0F << 24, + +	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= default_send_IPI_mask_logical, +	.send_IPI_mask_allbutself	= default_send_IPI_mask_allbutself_logical, +	.send_IPI_allbutself		= default_send_IPI_allbutself, +	.send_IPI_all			= default_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, + +	.wait_for_init_deassert		= default_wait_for_init_deassert, + +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= default_inquire_remote_apic, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; + +extern struct apic apic_numaq; +extern struct apic apic_summit; +extern struct apic apic_bigsmp; +extern struct apic apic_es7000; +extern struct apic apic_es7000_cluster; +extern struct apic apic_default; + +struct apic *apic = &apic_default; +EXPORT_SYMBOL_GPL(apic); + +static struct apic *apic_probe[] __initdata = { +#ifdef CONFIG_X86_NUMAQ +	&apic_numaq, +#endif +#ifdef CONFIG_X86_SUMMIT +	&apic_summit, +#endif +#ifdef CONFIG_X86_BIGSMP +	&apic_bigsmp, +#endif +#ifdef CONFIG_X86_ES7000 +	&apic_es7000, +	&apic_es7000_cluster, +#endif +	&apic_default,	/* must be last */ +	NULL, +}; + +static int cmdline_apic __initdata; +static int __init parse_apic(char *arg) +{ +	int i; + +	if (!arg) +		return -EINVAL; + +	for (i = 0; apic_probe[i]; i++) { +		if (!strcmp(apic_probe[i]->name, arg)) { +			apic = apic_probe[i]; +			cmdline_apic = 1; +			return 0; +		} +	} + +	/* Parsed again by __setup for debug/verbose */ +	return 0; +} +early_param("apic", parse_apic); + +void __init generic_bigsmp_probe(void) +{ +#ifdef CONFIG_X86_BIGSMP +	/* +	 * This routine is used to switch to bigsmp mode when +	 * - There is no apic= option specified by the user +	 * - generic_apic_probe() has chosen apic_default as the sub_arch +	 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support +	 */ + +	if 
(!cmdline_apic && apic == &apic_default) { +		if (apic_bigsmp.probe()) { +			apic = &apic_bigsmp; +			printk(KERN_INFO "Overriding APIC driver with %s\n", +			       apic->name); +		} +	} +#endif +} + +void __init generic_apic_probe(void) +{ +	if (!cmdline_apic) { +		int i; +		for (i = 0; apic_probe[i]; i++) { +			if (apic_probe[i]->probe()) { +				apic = apic_probe[i]; +				break; +			} +		} +		/* Not visible without early console */ +		if (!apic_probe[i]) +			panic("Didn't find an APIC driver"); +	} +	printk(KERN_INFO "Using APIC driver %s\n", apic->name); +} + +/* These functions can switch the APIC even after the initial ->probe() */ + +int __init +generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +{ +	int i; + +	for (i = 0; apic_probe[i]; ++i) { +		if (!apic_probe[i]->mps_oem_check) +			continue; +		if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) +			continue; + +		if (!cmdline_apic) { +			apic = apic_probe[i]; +			printk(KERN_INFO "Switched to APIC driver `%s'.\n", +			       apic->name); +		} +		return 1; +	} +	return 0; +} + +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +	int i; + +	for (i = 0; apic_probe[i]; ++i) { +		if (!apic_probe[i]->acpi_madt_oem_check) +			continue; +		if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) +			continue; + +		if (!cmdline_apic) { +			apic = apic_probe[i]; +			printk(KERN_INFO "Switched to APIC driver `%s'.\n", +			       apic->name); +		} +		return 1; +	} +	return 0; +} diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/apic/probe_64.c index 2bced78b0b8..8d7748efe6a 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -19,22 +19,27 @@  #include <linux/dmar.h>  #include <asm/smp.h> +#include <asm/apic.h>  #include <asm/ipi.h> -#include <asm/genapic.h>  #include <asm/setup.h> -extern struct genapic apic_flat; -extern struct genapic apic_physflat; -extern struct genapic apic_x2xpic_uv_x; -extern struct genapic apic_x2apic_phys; -extern struct genapic apic_x2apic_cluster; +extern struct apic apic_flat; +extern struct apic apic_physflat; +extern struct apic apic_x2xpic_uv_x; +extern struct apic apic_x2apic_phys; +extern struct apic apic_x2apic_cluster; -struct genapic __read_mostly *genapic = &apic_flat; +struct apic __read_mostly *apic = &apic_flat; +EXPORT_SYMBOL_GPL(apic); -static struct genapic *apic_probe[] __initdata = { +static struct apic *apic_probe[] __initdata = { +#ifdef CONFIG_X86_UV  	&apic_x2apic_uv_x, +#endif +#ifdef CONFIG_X86_X2APIC  	&apic_x2apic_phys,  	&apic_x2apic_cluster, +#endif  	&apic_physflat,  	NULL,  }; @@ -42,39 +47,45 @@ static struct genapic *apic_probe[] __initdata = {  /*   * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.   
*/ -void __init setup_apic_routing(void) +void __init default_setup_apic_routing(void)  { -	if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { -		if (!intr_remapping_enabled) -			genapic = &apic_flat; +#ifdef CONFIG_X86_X2APIC +	if (x2apic && (apic != &apic_x2apic_phys && +#ifdef CONFIG_X86_UV +		       apic != &apic_x2apic_uv_x && +#endif +		       apic != &apic_x2apic_cluster)) { +		if (x2apic_phys) +			apic = &apic_x2apic_phys; +		else +			apic = &apic_x2apic_cluster; +		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);  	} +#endif -	if (genapic == &apic_flat) { +	if (apic == &apic_flat) {  		if (max_physical_apicid >= 8) -			genapic = &apic_physflat; -		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +			apic = &apic_physflat; +		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);  	} - -	if (x86_quirks->update_genapic) -		x86_quirks->update_genapic();  }  /* Same for both flat and physical. */  void apic_send_IPI_self(int vector)  { -	__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);  } -int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  {  	int i;  	for (i = 0; apic_probe[i]; ++i) {  		if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { -			genapic = apic_probe[i]; +			apic = apic_probe[i];  			printk(KERN_INFO "Setting APIC routing to %s.\n", -				genapic->name); +				apic->name);  			return 1;  		}  	} diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c new file mode 100644 index 00000000000..aac52fa873f --- /dev/null +++ b/arch/x86/kernel/apic/summit_32.c @@ -0,0 +1,579 @@ +/* + * IBM Summit-Specific Code + * + * Written By: Matthew Dobson, IBM Corporation + * + * Copyright (c) 2003 IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <colpatch@us.ibm.com> + * + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <asm/io.h> +#include <asm/bios_ebda.h> + +/* + * APIC driver for the IBM "Summit" chipset. 
+ */ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/smp.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <asm/ipi.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <linux/smp.h> + +static unsigned summit_get_apic_id(unsigned long x) +{ +	return (x >> 24) & 0xFF; +} + +static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector) +{ +	default_send_IPI_mask_sequence_logical(mask, vector); +} + +static void summit_send_IPI_allbutself(int vector) +{ +	cpumask_t mask = cpu_online_map; +	cpu_clear(smp_processor_id(), mask); + +	if (!cpus_empty(mask)) +		summit_send_IPI_mask(&mask, vector); +} + +static void summit_send_IPI_all(int vector) +{ +	summit_send_IPI_mask(&cpu_online_map, vector); +} + +#include <asm/tsc.h> + +extern int use_cyclone; + +#ifdef CONFIG_X86_SUMMIT_NUMA +static void setup_summit(void); +#else +static inline void setup_summit(void) {} +#endif + +static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, +		char *productid) +{ +	if (!strncmp(oem, "IBM ENSW", 8) && +			(!strncmp(productid, "VIGIL SMP", 9) +			 || !strncmp(productid, "EXA", 3) +			 || !strncmp(productid, "RUTHLESS SMP", 12))){ +		mark_tsc_unstable("Summit based system"); +		use_cyclone = 1; /*enable cyclone-timer*/ +		setup_summit(); +		return 1; +	} +	return 0; +} + +/* Hook from generic ACPI tables.c */ +static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +	if (!strncmp(oem_id, "IBM", 3) && +	    (!strncmp(oem_table_id, "SERVIGIL", 8) +	     || !strncmp(oem_table_id, "EXA", 3))){ +		mark_tsc_unstable("Summit based system"); +		use_cyclone = 1; /*enable cyclone-timer*/ +		setup_summit(); +		return 1; +	} +	return 0; +} + +struct rio_table_hdr { +	unsigned char version;      /* Version number of this data structure           */ +	                            /* Version 3 adds chassis_num & WP_index           */ +	unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil)   */ +	unsigned char num_rio_dev;  /* # of RIO I/O devices (Cyclones and Winnipegs)   */ +} __attribute__((packed)); + +struct scal_detail { +	unsigned char node_id;      /* Scalability Node ID                             */ +	unsigned long CBAR;         /* Address of 1MB register space                   */ +	unsigned char port0node;    /* Node ID port connected to: 0xFF=None            */ +	unsigned char port0port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ +	unsigned char port1node;    /* Node ID port connected to: 0xFF = None          */ +	unsigned char port1port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ +	unsigned char port2node;    /* Node ID port connected to: 0xFF = None          */ +	unsigned char port2port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ +	unsigned char chassis_num;  /* 1 based Chassis number (1 = boot node)          */ +} __attribute__((packed)); + +struct rio_detail { +	unsigned char node_id;      /* RIO Node ID                                     */ +	unsigned long BBAR;         /* Address of 1MB register space                   */ +	unsigned char type;         /* Type of device                                  */ +	unsigned char owner_id;     /* For WPEG: Node ID of Cyclone that owns this WPEG*/ +	                            /* For CYC:  Node ID of Twister that owns this CYC */ +	unsigned char port0node;    /* Node ID port connected to: 0xFF=None            */ +	
unsigned char port0port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ +	unsigned char port1node;    /* Node ID port connected to: 0xFF=None            */ +	unsigned char port1port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ +	unsigned char first_slot;   /* For WPEG: Lowest slot number below this WPEG    */ +	                            /* For CYC:  0                                     */ +	unsigned char status;       /* For WPEG: Bit 0 = 1 : the XAPIC is used         */ +	                            /*                 = 0 : the XAPIC is not used, ie:*/ +	                            /*                     ints fwded to another XAPIC */ +	                            /*           Bits1:7 Reserved                      */ +	                            /* For CYC:  Bits0:7 Reserved                      */ +	unsigned char WP_index;     /* For WPEG: WPEG instance index - lower ones have */ +	                            /*           lower slot numbers/PCI bus numbers    */ +	                            /* For CYC:  No meaning                            */ +	unsigned char chassis_num;  /* 1 based Chassis number                          */ +	                            /* For LookOut WPEGs this field indicates the      */ +	                            /* Expansion Chassis #, enumerated from Boot       */ +	                            /* Node WPEG external port, then Boot Node CYC     */ +	                            /* external port, then Next Vigil chassis WPEG     */ +	                            /* external port, etc.                             */ +	                            /* Shared Lookouts have only 1 chassis number (the */ +	                            /* first one assigned)                             */ +} __attribute__((packed)); + + +typedef enum { +	CompatTwister = 0,  /* Compatibility Twister               */ +	AltTwister    = 1,  /* Alternate Twister of internal 8-way */ +	CompatCyclone = 2,  /* Compatibility Cyclone               */ +	AltCyclone    = 3,  /* Alternate Cyclone of internal 8-way */ +	CompatWPEG    = 4,  /* Compatibility WPEG                  */ +	AltWPEG       = 5,  /* Second Planar WPEG                  */ +	LookOutAWPEG  = 6,  /* LookOut WPEG                        */ +	LookOutBWPEG  = 7,  /* LookOut WPEG                        */ +} node_type; + +static inline int is_WPEG(struct rio_detail *rio){ +	return (rio->type == CompatWPEG || rio->type == AltWPEG || +		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); +} + + +/* In clustered mode, the high nibble of APIC ID is a cluster number. + * The low nibble is a 4-bit bitmap. */ +#define XAPIC_DEST_CPUS_SHIFT	4 +#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) + +#define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER) + +static const cpumask_t *summit_target_cpus(void) +{ +	/* CPU_MASK_ALL (0xff) has undefined behaviour with +	 * dest_LowestPrio mode logical clustered apic interrupt routing +	 * Just start on cpu 0.  
IRQ balancing will spread load +	 */ +	return &cpumask_of_cpu(0); +} + +static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +{ +	return 0; +} + +/* we don't use the phys_cpu_present_map to indicate apicid presence */ +static unsigned long summit_check_apicid_present(int bit) +{ +	return 1; +} + +static void summit_init_apic_ldr(void) +{ +	unsigned long val, id; +	int count = 0; +	u8 my_id = (u8)hard_smp_processor_id(); +	u8 my_cluster = APIC_CLUSTER(my_id); +#ifdef CONFIG_SMP +	u8 lid; +	int i; + +	/* Create logical APIC IDs by counting CPUs already in cluster. */ +	for (count = 0, i = nr_cpu_ids; --i >= 0; ) { +		lid = cpu_2_logical_apicid[i]; +		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) +			++count; +	} +#endif +	/* We only have a 4 wide bitmap in cluster mode.  If a deranged +	 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ +	BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); +	id = my_cluster | (1UL << count); +	apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); +	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +	val |= SET_APIC_LOGICAL_ID(id); +	apic_write(APIC_LDR, val); +} + +static int summit_apic_id_registered(void) +{ +	return 1; +} + +static void summit_setup_apic_routing(void) +{ +	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n", +						nr_ioapics); +} + +static int summit_apicid_to_node(int logical_apicid) +{ +#ifdef CONFIG_SMP +	return apicid_2_node[hard_smp_processor_id()]; +#else +	return 0; +#endif +} + +/* Mapping from cpu number to logical apicid */ +static inline int summit_cpu_to_logical_apicid(int cpu) +{ +#ifdef CONFIG_SMP +	if (cpu >= nr_cpu_ids) +		return BAD_APICID; +	return cpu_2_logical_apicid[cpu]; +#else +	return logical_smp_processor_id(); +#endif +} + +static int summit_cpu_present_to_apicid(int mps_cpu) +{ +	if (mps_cpu < nr_cpu_ids) +		return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); +	else +		return BAD_APICID; +} + +static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +{ +	/* For clustered we don't have a good way to do this yet - hack */ +	return physids_promote(0x0F); +} + +static physid_mask_t summit_apicid_to_cpu_present(int apicid) +{ +	return physid_mask_of_physid(0); +} + +static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +{ +	return 1; +} + +static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask) +{ +	unsigned int round = 0; +	int cpu, apicid = 0; + +	/* +	 * The cpus in the mask must all be on the apic cluster. +	 */ +	for_each_cpu(cpu, cpumask) { +		int new_apicid = summit_cpu_to_logical_apicid(cpu); + +		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { +			printk("%s: Not a valid mask!\n", __func__); +			return BAD_APICID; +		} +		apicid |= new_apicid; +		round++; +	} +	return apicid; +} + +static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, +			      const struct cpumask *andmask) +{ +	int apicid = summit_cpu_to_logical_apicid(0); +	cpumask_var_t cpumask; + +	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) +		return apicid; + +	cpumask_and(cpumask, inmask, andmask); +	cpumask_and(cpumask, cpumask, cpu_online_mask); +	apicid = summit_cpu_mask_to_apicid(cpumask); + +	free_cpumask_var(cpumask); + +	return apicid; +} + +/* + * cpuid returns the value latched in the HW at reset, not the APIC ID + * register's value.  For any box whose BIOS changes APIC IDs, like + * clustered APIC systems, we must use hard_smp_processor_id. + * + * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. 
+ */
+static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
+static int probe_summit(void)
+{
+	/* probed later in mptable/ACPI hooks */
+	return 0;
+}
+
+static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+	/* Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+	 */
+	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+static struct rio_table_hdr *rio_table_hdr;
+static struct scal_detail   *scal_devs[MAX_NUMNODES];
+static struct rio_detail    *rio_devs[MAX_NUMNODES*4];
+
+#ifndef CONFIG_X86_NUMAQ
+static int mp_bus_id_to_node[MAX_MP_BUSSES];
+#endif
+
+static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+	int twister = 0, node = 0;
+	int i, bus, num_buses;
+
+	for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+		if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
+			twister = rio_devs[i]->owner_id;
+			break;
+		}
+	}
+	if (i == rio_table_hdr->num_rio_dev) {
+		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
+		return last_bus;
+	}
+
+	for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
+		if (scal_devs[i]->node_id == twister) {
+			node = scal_devs[i]->node_id;
+			break;
+		}
+	}
+	if (i == rio_table_hdr->num_scal_dev) {
+		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
+		return last_bus;
+	}
+
+	switch (rio_devs[wpeg_num]->type) {
+	case CompatWPEG:
+		/*
+		 * The Compatibility Winnipeg controls the 2 legacy buses,
+		 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+		 * a PCI-PCI bridge card is used in either slot: total 5 buses.
+		 */
+		num_buses = 5;
+		break;
+	case AltWPEG:
+		/*
+		 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+		 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+		 * the "extra" buses for each of those slots: total 7 buses.
+		 */
+		num_buses = 7;
+		break;
+	case LookOutAWPEG:
+	case LookOutBWPEG:
+		/*
+		 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+		 * & the "extra" buses for each of those slots: total 9 buses.
+		 */
+		num_buses = 9;
+		break;
+	default:
+		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
+		return last_bus;
+	}
+
+	for (bus = last_bus; bus < last_bus + num_buses; bus++)
+		mp_bus_id_to_node[bus] = node;
+	return bus;
+}
+
+static int build_detail_arrays(void)
+{
+	unsigned long ptr;
+	int i, scal_detail_size, rio_detail_size;
+
+	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
+		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  
Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); +		return 0; +	} + +	switch (rio_table_hdr->version) { +	default: +		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); +		return 0; +	case 2: +		scal_detail_size = 11; +		rio_detail_size = 13; +		break; +	case 3: +		scal_detail_size = 12; +		rio_detail_size = 15; +		break; +	} + +	ptr = (unsigned long)rio_table_hdr + 3; +	for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) +		scal_devs[i] = (struct scal_detail *)ptr; + +	for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) +		rio_devs[i] = (struct rio_detail *)ptr; + +	return 1; +} + +void setup_summit(void) +{ +	unsigned long		ptr; +	unsigned short		offset; +	int			i, next_wpeg, next_bus = 0; + +	/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ +	ptr = get_bios_ebda(); +	ptr = (unsigned long)phys_to_virt(ptr); + +	rio_table_hdr = NULL; +	offset = 0x180; +	while (offset) { +		/* The block id is stored in the 2nd word */ +		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { +			/* set the pointer past the offset & block id */ +			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); +			break; +		} +		/* The next offset is stored in the 1st word.  0 means no more */ +		offset = *((unsigned short *)(ptr + offset)); +	} +	if (!rio_table_hdr) { +		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); +		return; +	} + +	if (!build_detail_arrays()) +		return; + +	/* The first Winnipeg we're looking for has an index of 0 */ +	next_wpeg = 0; +	do { +		for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { +			if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { +				/* It's the Winnipeg we're looking for! */ +				next_bus = setup_pci_node_map_for_wpeg(i, next_bus); +				next_wpeg++; +				break; +			} +		} +		/* +		 * If we go through all Rio devices and don't find one with +		 * the next index, it means we've found all the Winnipegs, +		 * and thus all the PCI buses. 
+		 */ +		if (i == rio_table_hdr->num_rio_dev) +			next_wpeg = 0; +	} while (next_wpeg != 0); +} +#endif + +struct apic apic_summit = { + +	.name				= "summit", +	.probe				= probe_summit, +	.acpi_madt_oem_check		= summit_acpi_madt_oem_check, +	.apic_id_registered		= summit_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	/* logical delivery broadcast to all CPUs: */ +	.irq_dest_mode			= 1, + +	.target_cpus			= summit_target_cpus, +	.disable_esr			= 1, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= summit_check_apicid_used, +	.check_apicid_present		= summit_check_apicid_present, + +	.vector_allocation_domain	= summit_vector_allocation_domain, +	.init_apic_ldr			= summit_init_apic_ldr, + +	.ioapic_phys_id_map		= summit_ioapic_phys_id_map, +	.setup_apic_routing		= summit_setup_apic_routing, +	.multi_timer_check		= NULL, +	.apicid_to_node			= summit_apicid_to_node, +	.cpu_to_logical_apicid		= summit_cpu_to_logical_apicid, +	.cpu_present_to_apicid		= summit_cpu_present_to_apicid, +	.apicid_to_cpu_present		= summit_apicid_to_cpu_present, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= summit_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= summit_phys_pkg_id, +	.mps_oem_check			= summit_mps_oem_check, + +	.get_apic_id			= summit_get_apic_id, +	.set_apic_id			= NULL, +	.apic_id_mask			= 0xFF << 24, + +	.cpu_mask_to_apicid		= summit_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= summit_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= summit_send_IPI_mask, +	.send_IPI_mask_allbutself	= NULL, +	.send_IPI_allbutself		= summit_send_IPI_allbutself, +	.send_IPI_all			= summit_send_IPI_all, +	.send_IPI_self			= default_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, + +	.wait_for_init_deassert		= default_wait_for_init_deassert, + +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= default_inquire_remote_apic, + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6ce497cc372..8fb87b6dd63 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -7,17 +7,14 @@  #include <linux/dmar.h>  #include <asm/smp.h> +#include <asm/apic.h>  #include <asm/ipi.h> -#include <asm/genapic.h>  DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);  static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  { -	if (cpu_has_x2apic) -		return 1; - -	return 0; +	return x2apic_enabled();  }  /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */ @@ -36,8 +33,8 @@ static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)  	cpumask_set_cpu(cpu, retmask);  } -static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, -				   unsigned int dest) +static void + __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)  {  	unsigned long cfg; @@ -46,7 +43,7 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,  	/*  	 * send the IPI.  	 
*/ -	x2apic_icr_write(cfg, apicid); +	native_x2apic_icr_write(cfg, apicid);  }  /* @@ -57,45 +54,50 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,   */  static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)  { -	unsigned long flags;  	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags); -	for_each_cpu(query_cpu, mask) +	for_each_cpu(query_cpu, mask) {  		__x2apic_send_IPI_dest(  			per_cpu(x86_cpu_to_logical_apicid, query_cpu), -			vector, APIC_DEST_LOGICAL); +			vector, apic->dest_logical); +	}  	local_irq_restore(flags);  } -static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, -					    int vector) +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)  { -	unsigned long flags; -	unsigned long query_cpu;  	unsigned long this_cpu = smp_processor_id(); +	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags); -	for_each_cpu(query_cpu, mask) -		if (query_cpu != this_cpu) -			__x2apic_send_IPI_dest( +	for_each_cpu(query_cpu, mask) { +		if (query_cpu == this_cpu) +			continue; +		__x2apic_send_IPI_dest(  				per_cpu(x86_cpu_to_logical_apicid, query_cpu), -				vector, APIC_DEST_LOGICAL); +				vector, apic->dest_logical); +	}  	local_irq_restore(flags);  }  static void x2apic_send_IPI_allbutself(int vector)  { -	unsigned long flags; -	unsigned long query_cpu;  	unsigned long this_cpu = smp_processor_id(); +	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags); -	for_each_online_cpu(query_cpu) -		if (query_cpu != this_cpu) -			__x2apic_send_IPI_dest( +	for_each_online_cpu(query_cpu) { +		if (query_cpu == this_cpu) +			continue; +		__x2apic_send_IPI_dest(  				per_cpu(x86_cpu_to_logical_apicid, query_cpu), -				vector, APIC_DEST_LOGICAL); +				vector, apic->dest_logical); +	}  	local_irq_restore(flags);  } @@ -111,21 +113,21 @@ static int x2apic_apic_id_registered(void)  static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)  { -	int cpu; -  	/*  	 * We're using fixed IRQ delivery, can only return one logical APIC ID.  	 * May as well be the first.  	 */ -	cpu = cpumask_first(cpumask); +	int cpu = cpumask_first(cpumask); +  	if ((unsigned)cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_logical_apicid, cpu);  	else  		return BAD_APICID;  } -static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -						  const struct cpumask *andmask) +static unsigned int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			      const struct cpumask *andmask)  {  	int cpu; @@ -133,15 +135,18 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  	 * We're using fixed IRQ delivery, can only return one logical APIC ID.  	 * May as well be the first.  	 
*/ -	for_each_cpu_and(cpu, cpumask, andmask) +	for_each_cpu_and(cpu, cpumask, andmask) {  		if (cpumask_test_cpu(cpu, cpu_online_mask))  			break; +	} +  	if (cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_logical_apicid, cpu); +  	return BAD_APICID;  } -static unsigned int get_apic_id(unsigned long x) +static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)  {  	unsigned int id; @@ -157,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id)  	return x;  } -static unsigned int phys_pkg_id(int index_msb) +static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)  {  	return current_cpu_data.initial_apicid >> index_msb;  } @@ -172,27 +177,63 @@ static void init_x2apic_ldr(void)  	int cpu = smp_processor_id();  	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); -	return;  } -struct genapic apic_x2apic_cluster = { -	.name = "cluster x2apic", -	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check, -	.int_delivery_mode = dest_LowestPrio, -	.int_dest_mode = (APIC_DEST_LOGICAL != 0), -	.target_cpus = x2apic_target_cpus, -	.vector_allocation_domain = x2apic_vector_allocation_domain, -	.apic_id_registered = x2apic_apic_id_registered, -	.init_apic_ldr = init_x2apic_ldr, -	.send_IPI_all = x2apic_send_IPI_all, -	.send_IPI_allbutself = x2apic_send_IPI_allbutself, -	.send_IPI_mask = x2apic_send_IPI_mask, -	.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, -	.send_IPI_self = x2apic_send_IPI_self, -	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, -	.phys_pkg_id = phys_pkg_id, -	.get_apic_id = get_apic_id, -	.set_apic_id = set_apic_id, -	.apic_id_mask = (0xFFFFFFFFu), +struct apic apic_x2apic_cluster = { + +	.name				= "cluster x2apic", +	.probe				= NULL, +	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check, +	.apic_id_registered		= x2apic_apic_id_registered, + +	.irq_delivery_mode		= dest_LowestPrio, +	.irq_dest_mode			= 1, /* logical */ + +	.target_cpus			= x2apic_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= x2apic_vector_allocation_domain, +	.init_apic_ldr			= init_x2apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.apicid_to_node			= NULL, +	.cpu_to_logical_apicid		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= x2apic_cluster_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= x2apic_cluster_phys_get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xFFFFFFFFu, + +	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= x2apic_send_IPI_mask, +	.send_IPI_mask_allbutself	= x2apic_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= x2apic_send_IPI_allbutself, +	.send_IPI_all			= x2apic_send_IPI_all, +	.send_IPI_self			= x2apic_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, + +	.read				= native_apic_msr_read, +	.write				= native_apic_msr_write, +	.icr_read			= native_x2apic_icr_read, +	.icr_write			= native_x2apic_icr_write, +	
.wait_icr_idle			= native_x2apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,  }; diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 21bcc0e098b..23625b9f98b 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,10 +7,10 @@  #include <linux/dmar.h>  #include <asm/smp.h> +#include <asm/apic.h>  #include <asm/ipi.h> -#include <asm/genapic.h> -static int x2apic_phys; +int x2apic_phys;  static int set_x2apic_phys_mode(char *arg)  { @@ -21,10 +21,10 @@ early_param("x2apic_phys", set_x2apic_phys_mode);  static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  { -	if (cpu_has_x2apic && x2apic_phys) -		return 1; - -	return 0; +	if (x2apic_phys) +		return x2apic_enabled(); +	else +		return 0;  }  /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */ @@ -50,13 +50,13 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,  	/*  	 * send the IPI.  	 */ -	x2apic_icr_write(cfg, apicid); +	native_x2apic_icr_write(cfg, apicid);  }  static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)  { -	unsigned long flags;  	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags);  	for_each_cpu(query_cpu, mask) { @@ -66,12 +66,12 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)  	local_irq_restore(flags);  } -static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, -					    int vector) +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)  { -	unsigned long flags; -	unsigned long query_cpu;  	unsigned long this_cpu = smp_processor_id(); +	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags);  	for_each_cpu(query_cpu, mask) { @@ -85,16 +85,17 @@ static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,  static void x2apic_send_IPI_allbutself(int vector)  { -	unsigned long flags; -	unsigned long query_cpu;  	unsigned long this_cpu = smp_processor_id(); +	unsigned long query_cpu; +	unsigned long flags;  	local_irq_save(flags); -	for_each_online_cpu(query_cpu) -		if (query_cpu != this_cpu) -			__x2apic_send_IPI_dest( -				per_cpu(x86_cpu_to_apicid, query_cpu), -				vector, APIC_DEST_PHYSICAL); +	for_each_online_cpu(query_cpu) { +		if (query_cpu == this_cpu) +			continue; +		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), +				       vector, APIC_DEST_PHYSICAL); +	}  	local_irq_restore(flags);  } @@ -110,21 +111,21 @@ static int x2apic_apic_id_registered(void)  static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)  { -	int cpu; -  	/*  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.  	 * May as well be the first.  	 */ -	cpu = cpumask_first(cpumask); +	int cpu = cpumask_first(cpumask); +  	if ((unsigned)cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_apicid, cpu);  	else  		return BAD_APICID;  } -static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -						  const struct cpumask *andmask) +static unsigned int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			      const struct cpumask *andmask)  {  	int cpu; @@ -132,31 +133,28 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.  	 * May as well be the first.  	 
*/ -	for_each_cpu_and(cpu, cpumask, andmask) +	for_each_cpu_and(cpu, cpumask, andmask) {  		if (cpumask_test_cpu(cpu, cpu_online_mask))  			break; +	} +  	if (cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_apicid, cpu); +  	return BAD_APICID;  } -static unsigned int get_apic_id(unsigned long x) +static unsigned int x2apic_phys_get_apic_id(unsigned long x)  { -	unsigned int id; - -	id = x; -	return id; +	return x;  }  static unsigned long set_apic_id(unsigned int id)  { -	unsigned long x; - -	x = id; -	return x; +	return id;  } -static unsigned int phys_pkg_id(int index_msb) +static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)  {  	return current_cpu_data.initial_apicid >> index_msb;  } @@ -168,27 +166,63 @@ static void x2apic_send_IPI_self(int vector)  static void init_x2apic_ldr(void)  { -	return;  } -struct genapic apic_x2apic_phys = { -	.name = "physical x2apic", -	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check, -	.int_delivery_mode = dest_Fixed, -	.int_dest_mode = (APIC_DEST_PHYSICAL != 0), -	.target_cpus = x2apic_target_cpus, -	.vector_allocation_domain = x2apic_vector_allocation_domain, -	.apic_id_registered = x2apic_apic_id_registered, -	.init_apic_ldr = init_x2apic_ldr, -	.send_IPI_all = x2apic_send_IPI_all, -	.send_IPI_allbutself = x2apic_send_IPI_allbutself, -	.send_IPI_mask = x2apic_send_IPI_mask, -	.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, -	.send_IPI_self = x2apic_send_IPI_self, -	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, -	.phys_pkg_id = phys_pkg_id, -	.get_apic_id = get_apic_id, -	.set_apic_id = set_apic_id, -	.apic_id_mask = (0xFFFFFFFFu), +struct apic apic_x2apic_phys = { + +	.name				= "physical x2apic", +	.probe				= NULL, +	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check, +	.apic_id_registered		= x2apic_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	.irq_dest_mode			= 0, /* physical */ + +	.target_cpus			= x2apic_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= 0, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= x2apic_vector_allocation_domain, +	.init_apic_ldr			= init_x2apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.apicid_to_node			= NULL, +	.cpu_to_logical_apicid		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= x2apic_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= x2apic_phys_get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xFFFFFFFFu, + +	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= x2apic_send_IPI_mask, +	.send_IPI_mask_allbutself	= x2apic_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= x2apic_send_IPI_allbutself, +	.send_IPI_all			= x2apic_send_IPI_all, +	.send_IPI_self			= x2apic_send_IPI_self, + +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, + +	.read				= native_apic_msr_read, +	.write				= native_apic_msr_write, +	.icr_read			= native_x2apic_icr_read, +	.icr_write			= native_x2apic_icr_write, +	.wait_icr_idle			= native_x2apic_wait_icr_idle, +	
.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,  }; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b193e082f6c..1bd6da1f8fa 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -7,27 +7,28 @@   *   * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.   */ - -#include <linux/kernel.h> -#include <linux/threads.h> -#include <linux/cpu.h>  #include <linux/cpumask.h> +#include <linux/hardirq.h> +#include <linux/proc_fs.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/module.h>  #include <linux/string.h>  #include <linux/ctype.h> -#include <linux/init.h>  #include <linux/sched.h> -#include <linux/module.h> -#include <linux/hardirq.h>  #include <linux/timer.h> -#include <linux/proc_fs.h> -#include <asm/current.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include <asm/genapic.h> -#include <asm/pgtable.h> +#include <linux/cpu.h> +#include <linux/init.h> +  #include <asm/uv/uv_mmrs.h>  #include <asm/uv/uv_hub.h> +#include <asm/current.h> +#include <asm/pgtable.h>  #include <asm/uv/bios.h> +#include <asm/uv/uv.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/smp.h>  DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -90,39 +91,43 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)  	cpumask_set_cpu(cpu, retmask);  } -int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)  { +#ifdef CONFIG_SMP  	unsigned long val;  	int pnode;  	pnode = uv_apicid_to_pnode(phys_apicid);  	val = (1UL << UVH_IPI_INT_SEND_SHFT) |  	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | -	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | +	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |  	    APIC_DM_INIT;  	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);  	mdelay(10);  	val = (1UL << UVH_IPI_INT_SEND_SHFT) |  	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | -	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | +	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |  	    APIC_DM_STARTUP;  	uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + +	atomic_set(&init_deasserted, 1); +#endif  	return 0;  }  static void uv_send_IPI_one(int cpu, int vector)  { -	unsigned long val, apicid, lapicid; +	unsigned long val, apicid;  	int pnode;  	apicid = per_cpu(x86_cpu_to_apicid, cpu); -	lapicid = apicid & 0x3f;		/* ZZZ macro needed */  	pnode = uv_apicid_to_pnode(apicid); -	val = -	    (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << -					      UVH_IPI_INT_APIC_ID_SHFT) | -	    (vector << UVH_IPI_INT_VECTOR_SHFT); + +	val = (1UL << UVH_IPI_INT_SEND_SHFT) | +	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) | +	      (vector << UVH_IPI_INT_VECTOR_SHFT); +  	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);  } @@ -136,22 +141,24 @@ static void uv_send_IPI_mask(const struct cpumask *mask, int vector)  static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)  { -	unsigned int cpu;  	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; -	for_each_cpu(cpu, mask) +	for_each_cpu(cpu, mask) {  		if (cpu != this_cpu)  			uv_send_IPI_one(cpu, vector); +	}  }  static void uv_send_IPI_allbutself(int vector)  { -	unsigned int cpu;  	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; -	for_each_online_cpu(cpu) +	for_each_online_cpu(cpu) {  		if (cpu != this_cpu)  			uv_send_IPI_one(cpu, vector); +	}  }  static void uv_send_IPI_all(int vector) @@ -170,21 
+177,21 @@ static void uv_init_apic_ldr(void)  static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)  { -	int cpu; -  	/*  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.  	 * May as well be the first.  	 */ -	cpu = cpumask_first(cpumask); +	int cpu = cpumask_first(cpumask); +  	if ((unsigned)cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_apicid, cpu);  	else  		return BAD_APICID;  } -static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -					      const struct cpumask *andmask) +static unsigned int +uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			  const struct cpumask *andmask)  {  	int cpu; @@ -192,15 +199,17 @@ static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.  	 * May as well be the first.  	 */ -	for_each_cpu_and(cpu, cpumask, andmask) +	for_each_cpu_and(cpu, cpumask, andmask) {  		if (cpumask_test_cpu(cpu, cpu_online_mask))  			break; +	}  	if (cpu < nr_cpu_ids)  		return per_cpu(x86_cpu_to_apicid, cpu); +  	return BAD_APICID;  } -static unsigned int get_apic_id(unsigned long x) +static unsigned int x2apic_get_apic_id(unsigned long x)  {  	unsigned int id; @@ -222,10 +231,10 @@ static unsigned long set_apic_id(unsigned int id)  static unsigned int uv_read_apic_id(void)  { -	return get_apic_id(apic_read(APIC_ID)); +	return x2apic_get_apic_id(apic_read(APIC_ID));  } -static unsigned int phys_pkg_id(int index_msb) +static int uv_phys_pkg_id(int initial_apicid, int index_msb)  {  	return uv_read_apic_id() >> index_msb;  } @@ -235,26 +244,64 @@ static void uv_send_IPI_self(int vector)  	apic_write(APIC_SELF_IPI, vector);  } -struct genapic apic_x2apic_uv_x = { -	.name = "UV large system", -	.acpi_madt_oem_check = uv_acpi_madt_oem_check, -	.int_delivery_mode = dest_Fixed, -	.int_dest_mode = (APIC_DEST_PHYSICAL != 0), -	.target_cpus = uv_target_cpus, -	.vector_allocation_domain = uv_vector_allocation_domain, -	.apic_id_registered = uv_apic_id_registered, -	.init_apic_ldr = uv_init_apic_ldr, -	.send_IPI_all = uv_send_IPI_all, -	.send_IPI_allbutself = uv_send_IPI_allbutself, -	.send_IPI_mask = uv_send_IPI_mask, -	.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, -	.send_IPI_self = uv_send_IPI_self, -	.cpu_mask_to_apicid = uv_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, -	.phys_pkg_id = phys_pkg_id, -	.get_apic_id = get_apic_id, -	.set_apic_id = set_apic_id, -	.apic_id_mask = (0xFFFFFFFFu), +struct apic apic_x2apic_uv_x = { + +	.name				= "UV large system", +	.probe				= NULL, +	.acpi_madt_oem_check		= uv_acpi_madt_oem_check, +	.apic_id_registered		= uv_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	.irq_dest_mode			= 1, /* logical */ + +	.target_cpus			= uv_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= APIC_DEST_LOGICAL, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= uv_vector_allocation_domain, +	.init_apic_ldr			= uv_init_apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	.multi_timer_check		= NULL, +	.apicid_to_node			= NULL, +	.cpu_to_logical_apicid		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= uv_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= x2apic_get_apic_id, +	
.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xFFFFFFFFu, + +	.cpu_mask_to_apicid		= uv_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= uv_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= uv_send_IPI_mask, +	.send_IPI_mask_allbutself	= uv_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= uv_send_IPI_allbutself, +	.send_IPI_all			= uv_send_IPI_all, +	.send_IPI_self			= uv_send_IPI_self, + +	.wakeup_secondary_cpu		= uv_wakeup_secondary, +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, + +	.read				= native_apic_msr_read, +	.write				= native_apic_msr_write, +	.icr_read			= native_x2apic_icr_read, +	.icr_write			= native_x2apic_icr_write, +	.wait_icr_idle			= native_x2apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,  };  static __cpuinit void set_x2apic_extra_bits(int pnode) @@ -322,7 +369,7 @@ static __init void map_high(char *id, unsigned long base, int shift,  	paddr = base << shift;  	bytes = (1UL << shift) * (max_pnode + 1);  	printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, -	       					paddr + bytes); +						paddr + bytes);  	if (map_type == map_uc)  		init_extra_mapping_uc(paddr, bytes);  	else @@ -485,7 +532,7 @@ late_initcall(uv_init_heartbeat);  /*   * Called on each cpu to initialize the per_cpu UV data area. - * 	ZZZ hotplug not supported yet + * FIXME: hotplug not supported yet   */  void __cpuinit uv_cpu_init(void)  { diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 266ec6c18b6..10033fe718e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);   */  #define APM_ZERO_SEGS -#include "apm.h" +#include <asm/apm.h>  /*   * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. 
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ee4df08feee..fbf2f33e308 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -75,6 +75,7 @@ void foo(void)  	OFFSET(PT_DS,  pt_regs, ds);  	OFFSET(PT_ES,  pt_regs, es);  	OFFSET(PT_FS,  pt_regs, fs); +	OFFSET(PT_GS,  pt_regs, gs);  	OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);  	OFFSET(PT_EIP, pt_regs, ip);  	OFFSET(PT_CS,  pt_regs, cs); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 1d41d3f1edb..8793ab33e2c 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -11,7 +11,6 @@  #include <linux/hardirq.h>  #include <linux/suspend.h>  #include <linux/kbuild.h> -#include <asm/pda.h>  #include <asm/processor.h>  #include <asm/segment.h>  #include <asm/thread_info.h> @@ -48,16 +47,6 @@ int main(void)  #endif  	BLANK();  #undef ENTRY -#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) -	ENTRY(kernelstack);  -	ENTRY(oldrsp);  -	ENTRY(pcurrent);  -	ENTRY(irqcount); -	ENTRY(cpunumber); -	ENTRY(irqstackptr); -	ENTRY(data_offset); -	BLANK(); -#undef ENTRY  #ifdef CONFIG_PARAVIRT  	BLANK();  	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 2cf23634b6d..6882a735d9c 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -7,7 +7,7 @@  #include <asm/pat.h>  #include <asm/processor.h> -#include <mach_apic.h> +#include <asm/apic.h>  struct cpuid_bit {  	u16 feature; @@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)   */  void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP  	unsigned int eax, ebx, ecx, edx, sub_index;  	unsigned int ht_mask_width, core_plus_mask_width;  	unsigned int core_select_mask, core_level_siblings; @@ -116,22 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)  	core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; -#ifdef CONFIG_X86_32 -	c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) +	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)  						 & core_select_mask; -	c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); +	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);  	/*  	 * Reinit the apicid, now that we have extended initial_apicid.  	 */ -	c->apicid = phys_pkg_id(c->initial_apicid, 0); -#else -	c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; -	c->phys_proc_id = phys_pkg_id(core_plus_mask_width); -	/* -	 * Reinit the apicid, now that we have extended initial_apicid. -	 */ -	c->apicid = phys_pkg_id(0); -#endif +	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +  	c->x86_max_cores = (core_level_siblings / smp_num_siblings); @@ -143,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)  	return;  #endif  } - -#ifdef CONFIG_X86_PAT -void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) -{ -	if (!cpu_has_pat) -		pat_disable("PAT not supported by CPU."); - -	switch (c->x86_vendor) { -	case X86_VENDOR_INTEL: -		/* -		 * There is a known erratum on Pentium III and Core Solo -		 * and Core Duo CPUs. 
-		 * " Page with PAT set to WC while associated MTRR is UC -		 *   may consolidate to UC " -		 * Because of this erratum, it is better to stick with -		 * setting WC in MTRR rather than using PAT on these CPUs. -		 * -		 * Enable PAT WC only on P4, Core 2 or later CPUs. -		 */ -		if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) -			return; - -		pat_disable("PAT WC disabled due to known CPU erratum."); -		return; - -	case X86_VENDOR_AMD: -	case X86_VENDOR_CENTAUR: -	case X86_VENDOR_TRANSMETA: -		return; -	} - -	pat_disable("PAT disabled. Not yet verified on this CPU type."); -} -#endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7c878f6aa91..f47df59016c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@  #include <asm/io.h>  #include <asm/processor.h>  #include <asm/apic.h> +#include <asm/cpu.h>  #ifdef CONFIG_X86_64  # include <asm/numa_64.h> @@ -12,8 +13,6 @@  # include <asm/cacheflush.h>  #endif -#include <mach_apic.h> -  #include "cpu.h"  #ifdef CONFIG_X86_32 @@ -143,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)  	}  } +static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP +	/* calling is from identify_secondary_cpu() ? */ +	if (c->cpu_index == boot_cpu_id) +		return; + +	/* +	 * Certain Athlons might work (for various values of 'work') in SMP +	 * but they are not certified as MP capable. +	 */ +	/* Athlon 660/661 is valid. */ +	if ((c->x86_model == 6) && ((c->x86_mask == 0) || +	    (c->x86_mask == 1))) +		goto valid_k7; + +	/* Duron 670 is valid */ +	if ((c->x86_model == 7) && (c->x86_mask == 0)) +		goto valid_k7; + +	/* +	 * Athlon 662, Duron 671, and Athlon >model 7 have capability +	 * bit. It's worth noting that the A5 stepping (662) of some +	 * Athlon XP's have the MP bit set. +	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for +	 * more. +	 */ +	if (((c->x86_model == 6) && (c->x86_mask >= 2)) || +	    ((c->x86_model == 7) && (c->x86_mask >= 1)) || +	     (c->x86_model > 7)) +		if (cpu_has_mp) +			goto valid_k7; + +	/* If we get here, not a certified SMP capable AMD system. 
*/ + +	/* +	 * Don't taint if we are running SMP kernel on a single non-MP +	 * approved Athlon +	 */ +	WARN_ONCE(1, "WARNING: This combination of AMD" +		"processors is not suitable for SMP.\n"); +	if (!test_taint(TAINT_UNSAFE_SMP)) +		add_taint(TAINT_UNSAFE_SMP); + +valid_k7: +	; +#endif +} +  static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)  {  	u32 l, h; @@ -177,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)  	}  	set_cpu_cap(c, X86_FEATURE_K7); + +	amd_k7_smp_check(c);  }  #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 83492b1f93b..826d5c87627 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,14 +21,14 @@  #include <asm/asm.h>  #include <asm/numa.h>  #include <asm/smp.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> +#include <asm/cpu.h> +#include <asm/cpumask.h>  #include <asm/apic.h> -#include <mach_apic.h> -#include <asm/genapic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/uv/uv.h>  #endif -#include <asm/pda.h>  #include <asm/pgtable.h>  #include <asm/processor.h>  #include <asm/desc.h> @@ -37,6 +37,7 @@  #include <asm/sections.h>  #include <asm/setup.h>  #include <asm/hypervisor.h> +#include <asm/stackprotector.h>  #include "cpu.h" @@ -50,6 +51,15 @@ cpumask_var_t cpu_initialized_mask;  /* representing cpus for which sibling maps can be computed */  cpumask_var_t cpu_sibling_setup_mask; +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ +	alloc_bootmem_cpumask_var(&cpu_initialized_mask); +	alloc_bootmem_cpumask_var(&cpu_callin_mask); +	alloc_bootmem_cpumask_var(&cpu_callout_mask); +	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} +  #else /* CONFIG_X86_32 */  cpumask_t cpu_callin_map; @@ -62,23 +72,23 @@ cpumask_t cpu_sibling_setup_map;  static struct cpu_dev *this_cpu __cpuinitdata; +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {  #ifdef CONFIG_X86_64 -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types  kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. -   Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { +	/* +	 * We need valid kernel segments for data and code in long mode too +	 * IRET will check the segment types  kkeil 2000/10/28 +	 * Also sysret mandates a special GDT layout +	 * +	 * The TLS descriptors are currently at a different place compared to i386. +	 * Hopefully nobody expects them at a fixed place (Wine?) 
+	 */  	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },  	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },  	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },  	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },  	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },  	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} };  #else -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {  	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },  	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },  	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, @@ -110,9 +120,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {  	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },  	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, -	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, -} }; +	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, +	GDT_STACK_CANARY_INIT  #endif +} };  EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);  #ifdef CONFIG_X86_32 @@ -213,6 +224,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)  #endif  /* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software.  Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { +	u32 feature; +	u32 level; +}; +static const struct cpuid_dependent_feature __cpuinitconst +cpuid_dependent_features[] = { +	{ X86_FEATURE_MWAIT,		0x00000005 }, +	{ X86_FEATURE_DCA,		0x00000009 }, +	{ X86_FEATURE_XSAVE,		0x0000000d }, +	{ 0, 0 } +}; + +static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ +	const struct cpuid_dependent_feature *df; +	for (df = cpuid_dependent_features; df->feature; df++) { +		/* +		 * Note: cpuid_level is set to -1 if unavailable, but +		 * extended_extended_level is set to 0 if unavailable +		 * and the legitimate extended levels are all negative +		 * when signed; hence the weird messing around with +		 * signs here... +		 */ +		if (cpu_has(c, df->feature) && +		    ((s32)df->level < 0 ? +		     (u32)df->level > (u32)c->extended_cpuid_level : +		     (s32)df->level > (s32)c->cpuid_level)) { +			clear_cpu_cap(c, df->feature); +			if (warn) +				printk(KERN_WARNING +				       "CPU: CPU feature %s disabled " +				       "due to lack of CPUID level 0x%x\n", +				       x86_cap_flags[df->feature], +				       df->level); +		} +	} +} + +/*   * Naming convention should be: <Name> [(<Codename>)]   * This table only is used unless init_<vendor>() below doesn't set it;   * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used @@ -242,18 +296,29 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)  __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 +	loadsegment(fs, __KERNEL_PERCPU); +#else +	loadsegment(gs, 0); +	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#endif +	load_stack_canary_segment(); +} +  /* Current gdt points %fs at the "master" per-cpu area: after this,   * it's on the real one. 
*/ -void switch_to_new_gdt(void) +void switch_to_new_gdt(int cpu)  {  	struct desc_ptr gdt_descr; -	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); +	gdt_descr.address = (long)get_cpu_gdt_table(cpu);  	gdt_descr.size = GDT_SIZE - 1;  	load_gdt(&gdt_descr); -#ifdef CONFIG_X86_32 -	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -#endif +	/* Reload the per-cpu base */ + +	load_percpu_segment(cpu);  }  static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; @@ -383,11 +448,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)  		}  		index_msb = get_count_order(smp_num_siblings); -#ifdef CONFIG_X86_64 -		c->phys_proc_id = phys_pkg_id(index_msb); -#else -		c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); -#endif +		c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);  		smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -395,13 +456,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)  		core_bits = get_count_order(c->x86_max_cores); -#ifdef CONFIG_X86_64 -		c->cpu_core_id = phys_pkg_id(index_msb) & -					       ((1 << core_bits) - 1); -#else -		c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & +		c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &  					       ((1 << core_bits) - 1); -#endif  	}  out: @@ -570,11 +626,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	if (this_cpu->c_early_init)  		this_cpu->c_early_init(c); -	validate_pat_support(c); -  #ifdef CONFIG_SMP  	c->cpu_index = boot_cpu_id;  #endif +	filter_cpuid_features(c, false);  }  void __init early_cpu_init(void) @@ -637,7 +692,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)  		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;  #ifdef CONFIG_X86_32  # ifdef CONFIG_X86_HT -		c->apicid = phys_pkg_id(c->initial_apicid, 0); +		c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);  # else  		c->apicid = c->initial_apicid;  # endif @@ -684,7 +739,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  		this_cpu->c_identify(c);  #ifdef CONFIG_X86_64 -	c->apicid = phys_pkg_id(0); +	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);  #endif  	/* @@ -708,6 +763,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  	 * we do "generic changes."  	 */ +	/* Filter out anything that depends on CPUID levels we don't have */ +	filter_cpuid_features(c, true); +  	/* If the model name is still unset, do table lookup. 
*/  	if (!c->x86_model_id[0]) {  		char *p; @@ -877,54 +935,22 @@ static __init int setup_disablecpuid(char *arg)  __setup("clearcpuid=", setup_disablecpuid);  #ifdef CONFIG_X86_64 -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); -  struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +DEFINE_PER_CPU_FIRST(union irq_stack_union, +		     irq_stack_union) __aligned(PAGE_SIZE); +DEFINE_PER_CPU(char *, irq_stack_ptr) = +	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -void __cpuinit pda_init(int cpu) -{ -	struct x8664_pda *pda = cpu_pda(cpu); - -	/* Setup up data that may be needed in __get_free_pages early */ -	loadsegment(fs, 0); -	loadsegment(gs, 0); -	/* Memory clobbers used to order PDA accessed */ -	mb(); -	wrmsrl(MSR_GS_BASE, pda); -	mb(); - -	pda->cpunumber = cpu; -	pda->irqcount = -1; -	pda->kernelstack = (unsigned long)stack_thread_info() - -				 PDA_STACKOFFSET + THREAD_SIZE; -	pda->active_mm = &init_mm; -	pda->mmu_state = 0; +DEFINE_PER_CPU(unsigned long, kernel_stack) = +	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); -	if (cpu == 0) { -		/* others are initialized in smpboot.c */ -		pda->pcurrent = &init_task; -		pda->irqstackptr = boot_cpu_stack; -		pda->irqstackptr += IRQSTACKSIZE - 64; -	} else { -		if (!pda->irqstackptr) { -			pda->irqstackptr = (char *) -				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); -			if (!pda->irqstackptr) -				panic("cannot allocate irqstack for cpu %d", -				      cpu); -			pda->irqstackptr += IRQSTACKSIZE - 64; -		} - -		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) -			pda->nodenumber = cpu_to_node(cpu); -	} -} +DEFINE_PER_CPU(unsigned int, irq_count) = -1; -static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + -				  DEBUG_STKSZ] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) +	__aligned(PAGE_SIZE);  extern asmlinkage void ignore_sysret(void); @@ -957,16 +983,21 @@ unsigned long kernel_eflags;   */  DEFINE_PER_CPU(struct orig_ist, orig_ist); -#else +#else	/* x86_64 */ + +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU(unsigned long, stack_canary); +#endif -/* Make sure %fs is initialized properly in idle threads */ +/* Make sure %fs and %gs are initialized properly in idle threads */  struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)  {  	memset(regs, 0, sizeof(struct pt_regs));  	regs->fs = __KERNEL_PERCPU; +	regs->gs = __KERNEL_STACK_CANARY;  	return regs;  } -#endif +#endif	/* x86_64 */  /*   * cpu_init() initializes state that is per-CPU. 
Some data is already @@ -982,15 +1013,14 @@ void __cpuinit cpu_init(void)  	struct tss_struct *t = &per_cpu(init_tss, cpu);  	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);  	unsigned long v; -	char *estacks = NULL;  	struct task_struct *me;  	int i; -	/* CPU 0 is initialised in head64.c */ -	if (cpu != 0) -		pda_init(cpu); -	else -		estacks = boot_exception_stacks; +#ifdef CONFIG_NUMA +	if (cpu != 0 && percpu_read(node_number) == 0 && +	    cpu_to_node(cpu) != NUMA_NO_NODE) +		percpu_write(node_number, cpu_to_node(cpu)); +#endif  	me = current; @@ -1006,7 +1036,9 @@ void __cpuinit cpu_init(void)  	 * and set up the GDT descriptor:  	 */ -	switch_to_new_gdt(); +	switch_to_new_gdt(cpu); +	loadsegment(fs, 0); +  	load_idt((const struct desc_ptr *)&idt_descr);  	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); @@ -1017,25 +1049,20 @@ void __cpuinit cpu_init(void)  	barrier();  	check_efer(); -	if (cpu != 0 && x2apic) +	if (cpu != 0)  		enable_x2apic();  	/*  	 * set up and load the per-CPU TSS  	 */  	if (!orig_ist->ist[0]) { -		static const unsigned int order[N_EXCEPTION_STACKS] = { -		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, -		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER +		static const unsigned int sizes[N_EXCEPTION_STACKS] = { +		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +		  [DEBUG_STACK - 1] = DEBUG_STKSZ  		}; +		char *estacks = per_cpu(exception_stacks, cpu);  		for (v = 0; v < N_EXCEPTION_STACKS; v++) { -			if (cpu) { -				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); -				if (!estacks) -					panic("Cannot allocate exception " -					      "stack %ld %d\n", v, cpu); -			} -			estacks += PAGE_SIZE << order[v]; +			estacks += sizes[v];  			orig_ist->ist[v] = t->x86_tss.ist[v] =  					(unsigned long)estacks;  		} @@ -1069,22 +1096,19 @@ void __cpuinit cpu_init(void)  	 */  	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)  		arch_kgdb_ops.correct_hw_break(); -	else { +	else  #endif -	/* -	 * Clear all 6 debug registers: -	 */ - -	set_debugreg(0UL, 0); -	set_debugreg(0UL, 1); -	set_debugreg(0UL, 2); -	set_debugreg(0UL, 3); -	set_debugreg(0UL, 6); -	set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB -	/* If the kgdb is connected no debug regs should be altered. */ +	{ +		/* +		 * Clear all 6 debug registers: +		 */ +		set_debugreg(0UL, 0); +		set_debugreg(0UL, 1); +		set_debugreg(0UL, 2); +		set_debugreg(0UL, 3); +		set_debugreg(0UL, 6); +		set_debugreg(0UL, 7);  	} -#endif  	fpu_init(); @@ -1114,7 +1138,7 @@ void __cpuinit cpu_init(void)  		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);  	load_idt(&idt_descr); -	switch_to_new_gdt(); +	switch_to_new_gdt(cpu);  	/*  	 * Set up and load the per-CPU TSS and LDT @@ -1135,9 +1159,6 @@ void __cpuinit cpu_init(void)  	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);  #endif -	/* Clear %gs. 
*/ -	asm volatile ("mov %0, %%gs" : : "r" (0)); -  	/* Clear all 6 debug registers: */  	set_debugreg(0, 0);  	set_debugreg(0, 1); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c..22590cf688a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)  	if (!data)  		return -ENOMEM; -	data->acpi_data = percpu_ptr(acpi_perf_data, cpu); +	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);  	per_cpu(drv_data, cpu) = data;  	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index c2f930d8664..41ab3f064cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_policy *policy)  	}  	/* Enable Enhanced PowerSaver */  	rdmsrl(MSR_IA32_MISC_ENABLE, val); -	if (!(val & 1 << 16)) { -		val |= 1 << 16; +	if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { +		val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;  		wrmsrl(MSR_IA32_MISC_ENABLE, val);  		/* Can be locked at 0 */  		rdmsrl(MSR_IA32_MISC_ENABLE, val); -		if (!(val & 1 << 16)) { +		if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {  			printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");  			return -ENODEV;  		} diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index f08998278a3..c9f1fdc0283 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)  	   enable it if not. */  	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	if (!(l & (1<<16))) { -		l |= (1<<16); +	if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { +		l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;  		dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);  		wrmsr(MSR_IA32_MISC_ENABLE, l, h);  		/* check to see if it stuck */  		rdmsr(MSR_IA32_MISC_ENABLE, l, h); -		if (!(l & (1<<16))) { +		if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {  			printk(KERN_INFO PFX  				"couldn't enable Enhanced SpeedStep\n");  			return -ENODEV; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 24ff26a38ad..191117f1ad5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@  #include <asm/uaccess.h>  #include <asm/ds.h>  #include <asm/bugs.h> +#include <asm/cpu.h>  #ifdef CONFIG_X86_64  #include <asm/topology.h> @@ -24,7 +25,6 @@  #ifdef CONFIG_X86_LOCAL_APIC  #include <asm/mpspec.h>  #include <asm/apic.h> -#include <mach_apic.h>  #endif  static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) @@ -63,6 +63,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);  	} +	/* +	 * There is a known erratum on Pentium III and Core Solo +	 * and Core Duo CPUs. +	 * " Page with PAT set to WC while associated MTRR is UC +	 *   may consolidate to UC " +	 * Because of this erratum, it is better to stick with +	 * setting WC in MTRR rather than using PAT on these CPUs. +	 * +	 * Enable PAT WC only on P4, Core 2 or later CPUs. 
+	 */ +	if (c->x86 == 6 && c->x86_model < 15) +		clear_cpu_cap(c, X86_FEATURE_PAT);  }  #ifdef CONFIG_X86_32 @@ -99,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void)  }  #endif +static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP +	/* calling is from identify_secondary_cpu() ? */ +	if (c->cpu_index == boot_cpu_id) +		return; + +	/* +	 * Mask B, Pentium, but not Pentium MMX +	 */ +	if (c->x86 == 5 && +	    c->x86_mask >= 1 && c->x86_mask <= 4 && +	    c->x86_model <= 3) { +		/* +		 * Remember we have B step Pentia with bugs +		 */ +		WARN_ONCE(1, "WARNING: SMP operation may be unreliable" +				    "with B stepping processors.\n"); +	} +#endif +} +  static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)  {  	unsigned long lo, hi; @@ -135,10 +169,10 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)  	 */  	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {  		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); -		if ((lo & (1<<9)) == 0) { +		if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {  			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");  			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); -			lo |= (1<<9);	/* Disable hw prefetching */ +			lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;  			wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);  		}  	} @@ -175,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)  #ifdef CONFIG_X86_NUMAQ  	numaq_tsc_disable();  #endif + +	intel_smp_check(c);  }  #else  static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index da299eb85fc..7293508d8f5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -147,7 +147,16 @@ struct _cpuid4_info {  	union _cpuid4_leaf_ecx ecx;  	unsigned long size;  	unsigned long can_disable; -	cpumask_t shared_cpu_map;	/* future?: only cpus/node is needed */ +	DECLARE_BITMAP(shared_cpu_map, NR_CPUS); +}; + +/* subset of above _cpuid4_info w/o shared_cpu_map */ +struct _cpuid4_info_regs { +	union _cpuid4_leaf_eax eax; +	union _cpuid4_leaf_ebx ebx; +	union _cpuid4_leaf_ecx ecx; +	unsigned long size; +	unsigned long can_disable;  };  #ifdef CONFIG_PCI @@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,  }  static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)  {  	if (index < 3)  		return; @@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)  }  static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +__cpuinit cpuid4_cache_lookup_regs(int index, +				   struct _cpuid4_info_regs *this_leaf)  {  	union _cpuid4_leaf_eax 	eax;  	union _cpuid4_leaf_ebx 	ebx; @@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)  	return 0;  } +static int +__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +{ +	struct _cpuid4_info_regs *leaf_regs = +		(struct _cpuid4_info_regs *)this_leaf; + +	return cpuid4_cache_lookup_regs(index, leaf_regs); +} +  static int __cpuinit find_num_cache_leaves(void)  {  	unsigned int		eax, ebx, ecx, edx; @@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)  		 * parameters cpuid leaf to find the cache details  		 */  		for (i = 0; i < num_cache_leaves; i++) { -			struct _cpuid4_info this_leaf; 
- +			struct _cpuid4_info_regs this_leaf;  			int retval; -			retval = cpuid4_cache_lookup(i, &this_leaf); +			retval = cpuid4_cache_lookup_regs(i, &this_leaf);  			if (retval >= 0) {  				switch(this_leaf.eax.split.level) {  				    case 1: @@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)  	num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;  	if (num_threads_sharing == 1) -		cpu_set(cpu, this_leaf->shared_cpu_map); +		cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));  	else {  		index_msb = get_count_order(num_threads_sharing);  		for_each_online_cpu(i) {  			if (cpu_data(i).apicid >> index_msb ==  			    c->apicid >> index_msb) { -				cpu_set(i, this_leaf->shared_cpu_map); +				cpumask_set_cpu(i, +					to_cpumask(this_leaf->shared_cpu_map));  				if (i != cpu && per_cpu(cpuid4_info, i))  { -					sibling_leaf = CPUID4_INFO_IDX(i, index); -					cpu_set(cpu, sibling_leaf->shared_cpu_map); +					sibling_leaf = +						CPUID4_INFO_IDX(i, index); +					cpumask_set_cpu(cpu, to_cpumask( +						sibling_leaf->shared_cpu_map));  				}  			}  		} @@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)  	int sibling;  	this_leaf = CPUID4_INFO_IDX(cpu, index); -	for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { +	for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {  		sibling_leaf = CPUID4_INFO_IDX(sibling, index); -		cpu_clear(cpu, sibling_leaf->shared_cpu_map); +		cpumask_clear_cpu(cpu, +				  to_cpumask(sibling_leaf->shared_cpu_map));  	}  }  #else @@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,  	int n = 0;  	if (len > 1) { -		cpumask_t *mask = &this_leaf->shared_cpu_map; +		const struct cpumask *mask; +		mask = to_cpumask(this_leaf->shared_cpu_map);  		n = type?  			
cpulist_scnprintf(buf, len-2, mask) :  			cpumask_scnprintf(buf, len-2, mask); @@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)  static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)  { -	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); +	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); +	int node = cpu_to_node(cpumask_first(mask));  	struct pci_dev *dev = NULL;  	ssize_t ret = 0;  	int i; @@ -733,7 +757,8 @@ static ssize_t  store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,  		    size_t count)  { -	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); +	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); +	int node = cpu_to_node(cpumask_first(mask));  	struct pci_dev *dev = NULL;  	unsigned int ret, index, val; @@ -878,7 +903,7 @@ err_out:  	return -ENOMEM;  } -static cpumask_t cache_dev_map = CPU_MASK_NONE; +static DECLARE_BITMAP(cache_dev_map, NR_CPUS);  /* Add/Remove cache interface for CPU device */  static int __cpuinit cache_add_dev(struct sys_device * sys_dev) @@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)  		}  		kobject_uevent(&(this_object->kobj), KOBJ_ADD);  	} -	cpu_set(cpu, cache_dev_map); +	cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));  	kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);  	return 0; @@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)  	if (per_cpu(cpuid4_info, cpu) == NULL)  		return; -	if (!cpu_isset(cpu, cache_dev_map)) +	if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))  		return; -	cpu_clear(cpu, cache_dev_map); +	cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));  	for (i = 0; i < num_cache_leaves; i++)  		kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o  obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o  obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o  obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633..3552119b091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)  	}  } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ -	old_cr4 = read_cr4(); -	clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ -	if (old_cr4 & X86_CR4_MCE) -		set_in_cr4(X86_CR4_MCE); -} -  static int __init mcheck_disable(char *str)  {  	mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fe79985ce0f..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@   * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.   * Rest from unknown author(s).   * 2004 Andi Kleen. Rewrote most of it. 
+ * Copyright 2008 Intel Corporation + * Author: Andi Kleen   */  #include <linux/init.h> @@ -24,6 +26,9 @@  #include <linux/ctype.h>  #include <linux/kmod.h>  #include <linux/kdebug.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/ratelimit.h>  #include <asm/processor.h>  #include <asm/msr.h>  #include <asm/mce.h> @@ -32,7 +37,6 @@  #include <asm/idle.h>  #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6  atomic_t mce_entry; @@ -47,7 +51,7 @@ static int mce_dont_init;   */  static int tolerant = 1;  static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank;  static unsigned long notify_user;  static int rip_msr;  static int mce_bootlog = -1; @@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };  static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { +	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ +	memset(m, 0, sizeof(struct mce)); +	m->cpu = smp_processor_id(); +	rdtscll(m->tsc); +} +  /*   * Lockless MCE logging infrastructure.   * This avoids deadlocks on printk locks without having to break locks. Also @@ -119,11 +136,11 @@ static void print_mce(struct mce *m)  			print_symbol("{%s}", m->ip);  		printk("\n");  	} -	printk(KERN_EMERG "TSC %Lx ", m->tsc); +	printk(KERN_EMERG "TSC %llx ", m->tsc);  	if (m->addr) -		printk("ADDR %Lx ", m->addr); +		printk("ADDR %llx ", m->addr);  	if (m->misc) -		printk("MISC %Lx ", m->misc); +		printk("MISC %llx ", m->misc);  	printk("\n");  	printk(KERN_EMERG "This is not a software problem!\n");  	printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)  	panic(msg);  } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c)  { +	if (mce_dont_init) +		return 0;  	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);  } @@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)  }  /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ +	struct mce m; +	int i; + +	mce_setup(&m); + +	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); +	for (i = 0; i < banks; i++) { +		if (!bank[i] || !test_bit(i, *b)) +			continue; + +		m.misc = 0; +		m.addr = 0; +		m.bank = i; +		m.tsc = 0; + +		barrier(); +		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); +		if (!(m.status & MCI_STATUS_VAL)) +			continue; + +		/* +		 * Uncorrected events are handled by the exception handler +		 * when it is enabled. But when the exception is disabled log +		 * everything. +		 * +		 * TBD do the same check for MCI_STATUS_EN here? +		 */ +		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) +			continue; + +		if (m.status & MCI_STATUS_MISCV) +			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); +		if (m.status & MCI_STATUS_ADDRV) +			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + +		if (!(flags & MCP_TIMESTAMP)) +			m.tsc = 0; +		/* +		 * Don't get the IP here because it's unlikely to +		 * have anything to do with the actual error location. 
+		 */ + +		mce_log(&m); +		add_taint(TAINT_MACHINE_CHECK); + +		/* +		 * Clear state for this bank. +		 */ +		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	} + +	/* +	 * Don't clear MCG_STATUS here because it's only defined for +	 * exceptions. +	 */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there!   */  void do_machine_check(struct pt_regs * regs, long error_code)  { @@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)  	 * error.  	 */  	int kill_it = 0; +	DECLARE_BITMAP(toclear, MAX_NR_BANKS);  	atomic_inc(&mce_entry); -	if ((regs -	     && notify_die(DIE_NMI, "machine check", regs, error_code, +	if (notify_die(DIE_NMI, "machine check", regs, error_code,  			   18, SIGKILL) == NOTIFY_STOP) -	    || !banks)  		goto out2; +	if (!banks) +		goto out2; + +	mce_setup(&m); -	memset(&m, 0, sizeof(struct mce)); -	m.cpu = smp_processor_id();  	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);  	/* if the restart IP is not valid, we're done for */  	if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)  	barrier();  	for (i = 0; i < banks; i++) { -		if (i < NR_SYSFS_BANKS && !bank[i]) +		__clear_bit(i, toclear); +		if (!bank[i])  			continue;  		m.misc = 0;  		m.addr = 0;  		m.bank = i; -		m.tsc = 0;  		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);  		if ((m.status & MCI_STATUS_VAL) == 0)  			continue; +		/* +		 * Non uncorrected errors are handled by machine_check_poll +		 * Leave them alone. +		 */ +		if ((m.status & MCI_STATUS_UC) == 0) +			continue; + +		/* +		 * Set taint even when machine check was not enabled. +		 */ +		add_taint(TAINT_MACHINE_CHECK); + +		__set_bit(i, toclear); +  		if (m.status & MCI_STATUS_EN) {  			/* if PCC was set, there's no way out */  			no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)  					no_way_out = 1;  				kill_it = 1;  			} +		} else { +			/* +			 * Machine check event was not enabled. Clear, but +			 * ignore. +			 */ +			continue;  		}  		if (m.status & MCI_STATUS_MISCV) @@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)  			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);  		mce_get_rip(&m, regs); -		if (error_code >= 0) -			rdtscll(m.tsc); -		if (error_code != -2) -			mce_log(&m); +		mce_log(&m);  		/* Did this bank cause the exception? */  		/* Assume that the bank with uncorrectable errors did it, @@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)  			panicm = m;  			panicm_found = 1;  		} - -		add_taint(TAINT_MACHINE_CHECK);  	} -	/* Never do anything final in the polling timer */ -	if (!regs) -		goto out; -  	/* If we didn't find an uncorrectable error, pick  	   the last one (shouldn't happen, just being safe). 
*/  	if (!panicm_found) @@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)  	/* notify userspace ASAP */  	set_thread_flag(TIF_MCE_NOTIFY); - out:  	/* the last thing we do is clear state */ -	for (i = 0; i < banks; i++) -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	for (i = 0; i < banks; i++) { +		if (test_bit(i, toclear)) +			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +	}  	wrmsrl(MSR_IA32_MCG_STATUS, 0);   out2:  	atomic_dec(&mce_entry); @@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)   * and historically has been the register value of the   * MSR_IA32_THERMAL_STATUS (Intel) msr.   */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status)  {  	struct mce m; -	memset(&m, 0, sizeof(m)); -	m.cpu = cpu; +	mce_setup(&m);  	m.bank = MCE_THERMAL_BANK;  	m.status = status; -	rdtscll(m.tsc);  	mce_log(&m);  }  #endif /* CONFIG_X86_MCE_INTEL */ @@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)  static int check_interval = 5 * 60; /* 5 minutes */  static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data)  { -	if (mce_available(¤t_cpu_data)) -		do_machine_check(NULL, 0); -} +	struct timer_list *t = &per_cpu(mce_timer, data); -static void mcheck_timer(struct work_struct *work) -{ -	on_each_cpu(mcheck_check_cpu, NULL, 1); +	WARN_ON(smp_processor_id() != data); + +	if (mce_available(¤t_cpu_data)) +		machine_check_poll(MCP_TIMESTAMP, +				&__get_cpu_var(mce_poll_banks));  	/*  	 * Alert userspace if needed.  If we logged an MCE, reduce the @@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)  				(int)round_jiffies_relative(check_interval*HZ));  	} -	schedule_delayed_work(&mcheck_work, next_interval); +	t->expires = jiffies + next_interval; +	add_timer(t); +} + +static void mce_do_trigger(struct work_struct *work) +{ +	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);  } +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); +  /* - * This is only called from process context.  This is where we do - * anything we need to alert userspace about new MCEs.  This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context.   */  int mce_notify_user(void)  { +	/* Not more than two messages every minute */ +	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); +  	clear_thread_flag(TIF_MCE_NOTIFY);  	if (test_and_clear_bit(0, ¬ify_user)) { -		static unsigned long last_print; -		unsigned long now = jiffies; -  		wake_up_interruptible(&mce_wait); -		if (trigger[0]) -			call_usermodehelper(trigger, trigger_argv, NULL, -						UMH_NO_WAIT); -		if (time_after_eq(now, last_print + (check_interval*HZ))) { -			last_print = now; +		/* +		 * There is no risk of missing notifications because +		 * work_pending is always cleared before the function is +		 * executed. 
+		 */ +		if (trigger[0] && !work_pending(&mce_trigger_work)) +			schedule_work(&mce_trigger_work); + +		if (__ratelimit(&ratelimit))  			printk(KERN_INFO "Machine check events logged\n"); -		}  		return 1;  	} @@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {  static __init int periodic_mcheck_init(void)  { -	next_interval = check_interval * HZ; -	if (next_interval) -		schedule_delayed_work(&mcheck_work, -				      round_jiffies_relative(next_interval)); -	idle_notifier_register(&mce_idle_notifier); -	return 0; +       idle_notifier_register(&mce_idle_notifier); +       return 0;  }  __initcall(periodic_mcheck_init); -  /*   * Initialize Machine Checks for a CPU.   */ -static void mce_init(void *dummy) +static int mce_cap_init(void)  {  	u64 cap; -	int i; +	unsigned b;  	rdmsrl(MSR_IA32_MCG_CAP, cap); -	banks = cap & 0xff; -	if (banks > MCE_EXTENDED_BANK) { -		banks = MCE_EXTENDED_BANK; -		printk(KERN_INFO "MCE: warning: using only %d banks\n", -		       MCE_EXTENDED_BANK); +	b = cap & 0xff; +	if (b > MAX_NR_BANKS) { +		printk(KERN_WARNING +		       "MCE: Using only %u machine check banks out of %u\n", +			MAX_NR_BANKS, b); +		b = MAX_NR_BANKS; +	} + +	/* Don't support asymmetric configurations today */ +	WARN_ON(banks != 0 && b != banks); +	banks = b; +	if (!bank) { +		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); +		if (!bank) +			return -ENOMEM; +		memset(bank, 0xff, banks * sizeof(u64));  	} +  	/* Use accurate RIP reporting if available. */  	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)  		rip_msr = MSR_IA32_MCG_EIP; -	/* Log the machine checks left over from the previous reset. -	   This also clears all registers */ -	do_machine_check(NULL, mce_bootlog ? -1 : -2); +	return 0; +} + +static void mce_init(void *dummy) +{ +	u64 cap; +	int i; +	mce_banks_t all_banks; + +	/* +	 * Log the machine checks left over from the previous reset. +	 */ +	bitmap_fill(all_banks, MAX_NR_BANKS); +	machine_check_poll(MCP_UC, &all_banks);  	set_in_cr4(X86_CR4_MCE); +	rdmsrl(MSR_IA32_MCG_CAP, cap);  	if (cap & MCG_CTL_P)  		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);  	for (i = 0; i < banks; i++) { -		if (i < NR_SYSFS_BANKS) -			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); -		else -			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - +		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);  		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);  	}  }  /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c)  {  	/* This should be disabled by the BIOS, but isn't always */  	if (c->x86_vendor == X86_VENDOR_AMD) { -		if(c->x86 == 15) +		if (c->x86 == 15 && banks > 4)  			/* disable GART TBL walk error reporting, which trips off  			   incorrectly with the IOMMU & 3ware & Cerberus. */ -			clear_bit(10, &bank[4]); +			clear_bit(10, (unsigned long *)&bank[4]);  		if(c->x86 <= 17 && mce_bootlog < 0)  			/* Lots of broken BIOS around that don't clear them  			   by default and leave crap in there. Don't log. 
*/ @@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)  	}  } +static void mce_init_timer(void) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); + +	/* data race harmless because everyone sets to the same value */ +	if (!next_interval) +		next_interval = check_interval * HZ; +	if (!next_interval) +		return; +	setup_timer(t, mcheck_timer, smp_processor_id()); +	t->expires = round_jiffies_relative(jiffies + next_interval); +	add_timer(t); +} +  /*   * Called for each booted CPU to set up machine checks.   * Must be called with preempt off.   */  void __cpuinit mcheck_init(struct cpuinfo_x86 *c)  { -	mce_cpu_quirks(c); +	if (!mce_available(c)) +		return; -	if (mce_dont_init || -	    !mce_available(c)) +	if (mce_cap_init() < 0) { +		mce_dont_init = 1;  		return; +	} +	mce_cpu_quirks(c);  	mce_init(NULL);  	mce_cpu_features(c); +	mce_init_timer();  }  /* @@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,  {  	unsigned long *cpu_tsc;  	static DEFINE_MUTEX(mce_read_mutex); -	unsigned next; +	unsigned prev, next;  	char __user *buf = ubuf;  	int i, err; @@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,  	}  	err = 0; -	for (i = 0; i < next; i++) { -		unsigned long start = jiffies; +	prev = 0; +	do { +		for (i = prev; i < next; i++) { +			unsigned long start = jiffies; -		while (!mcelog.entry[i].finished) { -			if (time_after_eq(jiffies, start + 2)) { -				memset(mcelog.entry + i,0, sizeof(struct mce)); -				goto timeout; +			while (!mcelog.entry[i].finished) { +				if (time_after_eq(jiffies, start + 2)) { +					memset(mcelog.entry + i, 0, +					       sizeof(struct mce)); +					goto timeout; +				} +				cpu_relax();  			} -			cpu_relax(); +			smp_rmb(); +			err |= copy_to_user(buf, mcelog.entry + i, +					    sizeof(struct mce)); +			buf += sizeof(struct mce); +timeout: +			;  		} -		smp_rmb(); -		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); -		buf += sizeof(struct mce); - timeout: -		; -	} -	memset(mcelog.entry, 0, next * sizeof(struct mce)); -	mcelog.next = 0; +		memset(mcelog.entry + prev, 0, +		       (next - prev) * sizeof(struct mce)); +		prev = next; +		next = cmpxchg(&mcelog.next, prev, 0); +	} while (next != prev);  	synchronize_sched(); @@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {  	&mce_chrdev_ops,  }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ -	old_cr4 = read_cr4(); -	clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ -	if (old_cr4 & X86_CR4_MCE) -		set_in_cr4(X86_CR4_MCE); -} -  /*   * Old style boot options parsing. Only for compatibility.   */ @@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)  	return 1;  } -/* mce=off disables machine check. Note you can re-enable it later -   using sysfs. +/* mce=off disables machine check.     mce=TOLERANCELEVEL (number, see above)     mce=bootlog Log MCEs from before booting. Disabled by default on AMD.     mce=nobootlog Don't log MCEs from before booting. */ @@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);   * Sysfs support   */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. 
+ */ +static int mce_disable(void) +{ +	int i; + +	for (i = 0; i < banks; i++) +		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +	return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ +	return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ +	return mce_disable(); +} +  /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.     Only one CPU is active at this time, the others get readded later using     CPU hotplug. */ @@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)  	return 0;  } +static void mce_cpu_restart(void *data) +{ +	del_timer_sync(&__get_cpu_var(mce_timer)); +	if (mce_available(¤t_cpu_data)) +		mce_init(NULL); +	mce_init_timer(); +} +  /* Reinit MCEs after user configuration changes */  static void mce_restart(void)  { -	if (next_interval) -		cancel_delayed_work(&mcheck_work); -	/* Timer race is harmless here */ -	on_each_cpu(mce_init, NULL, 1);  	next_interval = check_interval * HZ; -	if (next_interval) -		schedule_delayed_work(&mcheck_work, -				      round_jiffies_relative(next_interval)); +	on_each_cpu(mce_cpu_restart, NULL, 1);  }  static struct sysdev_class mce_sysclass = { +	.suspend = mce_suspend, +	.shutdown = mce_shutdown,  	.resume = mce_resume,  	.name = "machinecheck",  }; @@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit  	}								\  	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +			 char *buf) +{ +	u64 b = bank[attr - bank_attrs]; +	return sprintf(buf, "%llx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +			const char *buf, size_t siz) +{ +	char *end; +	u64 new = simple_strtoull(buf, &end, 0); +	if (end == buf) +		return -EINVAL; +	bank[attr - bank_attrs] = new; +	mce_restart(); +	return end-buf; +}  static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,  				char *buf) @@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);  static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);  ACCESSOR(check_interval,check_interval,mce_restart())  static struct sysdev_attribute *mce_attributes[] = { -	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, -	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,  	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,  	NULL  }; @@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)  		if (err)  			goto error;  	} +	for (i = 0; i < banks; i++) { +		err = sysdev_create_file(&per_cpu(device_mce, cpu), +					&bank_attrs[i]); +		if (err) +			goto error2; +	}  	cpu_set(cpu, mce_device_initialized);  	return 0; +error2: +	while (--i >= 0) { +		sysdev_remove_file(&per_cpu(device_mce, cpu), +					&bank_attrs[i]); +	}  error: -	while (i--) { +	while (--i >= 0) {  		sysdev_remove_file(&per_cpu(device_mce,cpu),  				   mce_attributes[i]);  	} @@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)  	for (i = 0; mce_attributes[i]; i++)  		
sysdev_remove_file(&per_cpu(device_mce,cpu),  			mce_attributes[i]); +	for (i = 0; i < banks; i++) +		sysdev_remove_file(&per_cpu(device_mce, cpu), +			&bank_attrs[i]);  	sysdev_unregister(&per_cpu(device_mce,cpu));  	cpu_clear(cpu, mce_device_initialized);  } +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void *h) +{ +	int i; +	unsigned long action = *(unsigned long *)h; + +	if (!mce_available(¤t_cpu_data)) +		return; +	if (!(action & CPU_TASKS_FROZEN)) +		cmci_clear(); +	for (i = 0; i < banks; i++) +		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void mce_reenable_cpu(void *h) +{ +	int i; +	unsigned long action = *(unsigned long *)h; + +	if (!mce_available(¤t_cpu_data)) +		return; +	if (!(action & CPU_TASKS_FROZEN)) +		cmci_reenable(); +	for (i = 0; i < banks; i++) +		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} +  /* Get notified when a cpu comes on/off. Be hotplug friendly. */  static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; +	struct timer_list *t = &per_cpu(mce_timer, cpu);  	switch (action) {  	case CPU_ONLINE: @@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,  			threshold_cpu_callback(action, cpu);  		mce_remove_device(cpu);  		break; +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		del_timer_sync(t); +		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); +		break; +	case CPU_DOWN_FAILED: +	case CPU_DOWN_FAILED_FROZEN: +		t->expires = round_jiffies_relative(jiffies + next_interval); +		add_timer_on(t, cpu); +		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); +		break; +	case CPU_POST_DEAD: +		/* intentionally ignoring frozen here */ +		cmci_rediscover(cpu); +		break;  	}  	return NOTIFY_OK;  } @@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {  	.notifier_call = mce_cpu_callback,  }; +static __init int mce_init_banks(void) +{ +	int i; + +	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, +				GFP_KERNEL); +	if (!bank_attrs) +		return -ENOMEM; + +	for (i = 0; i < banks; i++) { +		struct sysdev_attribute *a = &bank_attrs[i]; +		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); +		if (!a->attr.name) +			goto nomem; +		a->attr.mode = 0644; +		a->show = show_bank; +		a->store = set_bank; +	} +	return 0; + +nomem: +	while (--i >= 0) +		kfree(bank_attrs[i].attr.name); +	kfree(bank_attrs); +	bank_attrs = NULL; +	return -ENOMEM; +} +  static __init int mce_init_device(void)  {  	int err; @@ -906,6 +1161,11 @@ static __init int mce_init_device(void)  	if (!mce_available(&boot_cpu_data))  		return -EIO; + +	err = mce_init_banks(); +	if (err) +		return err; +  	err = sysdev_class_register(&mce_sysclass);  	if (err)  		return err; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index f2ee0ae29bd..c5a32f92d07 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {  struct threshold_bank {  	struct kobject *kobj;  	struct threshold_block *blocks; -	cpumask_t cpus; +	cpumask_var_t cpus;  };  static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {  static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */ +static void amd_threshold_interrupt(void); +  /*   * CPU 
Initialization   */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			tr.reset = 0;  			tr.old_limit = 0;  			threshold_restart_bank(&tr); + +			mce_threshold_vector = amd_threshold_interrupt;  		}  	}  } @@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)   * the interrupt goes off when error_count reaches threshold_limit.   * the handler will simply log mcelog w/ software defined bank number.   */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void)  {  	unsigned int bank, block;  	struct mce m;  	u32 low = 0, high = 0, address = 0; -	ack_APIC_irq(); -	exit_idle(); -	irq_enter(); - -	memset(&m, 0, sizeof(m)); -	rdtscll(m.tsc); -	m.cpu = smp_processor_id(); +	mce_setup(&m);  	/* assume first bank caused it */  	for (bank = 0; bank < NR_BANKS; ++bank) { @@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)  			/* Log the machine check that caused the threshold  			   event. */ -			do_machine_check(NULL, 0); +			machine_check_poll(MCP_TIMESTAMP, +					&__get_cpu_var(mce_poll_banks));  			if (high & MASK_OVERFLOW_HI) {  				rdmsrl(address, m.misc); @@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)  				       + bank * NR_BLOCKS  				       + block;  				mce_log(&m); -				goto out; +				return;  			}  		}  	} -out: -	inc_irq_stat(irq_threshold_count); -	irq_exit();  }  /* @@ -481,7 +477,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  #ifdef CONFIG_SMP  	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */ -		i = first_cpu(per_cpu(cpu_core_map, cpu)); +		i = cpumask_first(&per_cpu(cpu_core_map, cpu));  		/* first core not up yet */  		if (cpu_data(i).cpu_core_id) @@ -501,7 +497,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (err)  			goto out; -		b->cpus = per_cpu(cpu_core_map, cpu); +		cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));  		per_cpu(threshold_banks, cpu)[bank] = b;  		goto out;  	} @@ -512,15 +508,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		err = -ENOMEM;  		goto out;  	} +	if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { +		kfree(b); +		err = -ENOMEM; +		goto out; +	}  	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);  	if (!b->kobj)  		goto out_free;  #ifndef CONFIG_SMP -	b->cpus = CPU_MASK_ALL; +	cpumask_setall(b->cpus);  #else -	b->cpus = per_cpu(cpu_core_map, cpu); +	cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));  #endif  	per_cpu(threshold_banks, cpu)[bank] = b; @@ -529,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  	if (err)  		goto out_free; -	for_each_cpu_mask_nr(i, b->cpus) { +	for_each_cpu(i, b->cpus) {  		if (i == cpu)  			continue; @@ -545,6 +546,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  out_free:  	per_cpu(threshold_banks, cpu)[bank] = NULL; +	free_cpumask_var(b->cpus);  	kfree(b);  out:  	return err; @@ -619,7 +621,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #endif  	/* remove all sibling symlinks before unregistering */ -	for_each_cpu_mask_nr(i, b->cpus) { +	for_each_cpu(i, b->cpus) {  		if (i == cpu)  			continue; @@ -632,6 +634,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  free_out:  	kobject_del(b->kobj);  	kobject_put(b->kobj); +	free_cpumask_var(b->cpus);  	kfree(b);  	per_cpu(threshold_banks, cpu)[bank] = NULL;  } diff --git 
a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index f44c3662436..aaa7d973093 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,17 +1,21 @@  /*   * Intel specific MCE features.   * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen   */  #include <linux/init.h>  #include <linux/interrupt.h>  #include <linux/percpu.h>  #include <asm/processor.h> +#include <asm/apic.h>  #include <asm/msr.h>  #include <asm/mce.h>  #include <asm/hw_irq.h>  #include <asm/idle.h>  #include <asm/therm_throt.h> +#include <asm/apic.h>  asmlinkage void smp_thermal_interrupt(void)  { @@ -24,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)  	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);  	if (therm_throt_process(msr_val & 1)) -		mce_log_therm_throt_event(smp_processor_id(), msr_val); +		mce_log_therm_throt_event(msr_val);  	inc_irq_stat(irq_thermal_count);  	irq_exit(); @@ -48,13 +52,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)  	 */  	rdmsr(MSR_IA32_MISC_ENABLE, l, h);  	h = apic_read(APIC_LVTTHMR); -	if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { +	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {  		printk(KERN_DEBUG  		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);  		return;  	} -	if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) +	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))  		tm2 = 1;  	if (h & APIC_VECTOR_MASK) { @@ -72,7 +76,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)  	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);  	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); +	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);  	l = apic_read(APIC_LVTTHMR);  	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); @@ -84,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)  	return;  } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static int cmci_supported(int *banks) +{ +	u64 cap; + +	/* +	 * Vendor check is not strictly needed, but the initial +	 * initialization is vendor keyed and this +	 * makes sure none of the backdoors are entered otherwise. +	 */ +	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) +		return 0; +	if (!cpu_has_apic || lapic_get_maxlvt() < 6) +		return 0; +	rdmsrl(MSR_IA32_MCG_CAP, cap); +	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); +	return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. 
+ */ +static void intel_threshold_interrupt(void) +{ +	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +	mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ +	if (*hdr == 0) +		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); +	*hdr = 1; +	printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks, int boot) +{ +	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); +	int hdr = 0; +	int i; + +	spin_lock(&cmci_discover_lock); +	for (i = 0; i < banks; i++) { +		u64 val; + +		if (test_bit(i, owned)) +			continue; + +		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + +		/* Already owned by someone else? */ +		if (val & CMCI_EN) { +			if (test_and_clear_bit(i, owned) || boot) +				print_update("SHD", &hdr, i); +			__clear_bit(i, __get_cpu_var(mce_poll_banks)); +			continue; +		} + +		val |= CMCI_EN | CMCI_THRESHOLD; +		wrmsrl(MSR_IA32_MC0_CTL2 + i, val); +		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + +		/* Did the enable bit stick? -- the bank supports CMCI */ +		if (val & CMCI_EN) { +			if (!test_and_set_bit(i, owned) || boot) +				print_update("CMCI", &hdr, i); +			__clear_bit(i, __get_cpu_var(mce_poll_banks)); +		} else { +			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); +		} +	} +	spin_unlock(&cmci_discover_lock); +	if (hdr) +		printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ +	unsigned long flags; +	int banks; + +	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) +		return; +	local_irq_save(flags); +	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +	local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ +	int i; +	int banks; +	u64 val; + +	if (!cmci_supported(&banks)) +		return; +	spin_lock(&cmci_discover_lock); +	for (i = 0; i < banks; i++) { +		if (!test_bit(i, __get_cpu_var(mce_banks_owned))) +			continue; +		/* Disable CMCI */ +		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); +		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); +		wrmsrl(MSR_IA32_MC0_CTL2 + i, val); +		__clear_bit(i, __get_cpu_var(mce_banks_owned)); +	} +	spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void cmci_rediscover(int dying) +{ +	int banks; +	int cpu; +	cpumask_var_t old; + +	if (!cmci_supported(&banks)) +		return; +	if (!alloc_cpumask_var(&old, GFP_KERNEL)) +		return; +	cpumask_copy(old, &current->cpus_allowed); + +	for_each_online_cpu (cpu) { +		if (cpu == dying) +			continue; +		if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) +			continue; +		/* Recheck banks in case CPUs don't all have the same */ +		if (cmci_supported(&banks)) +			cmci_discover(banks, 0); +	} + +	set_cpus_allowed_ptr(current, old); +	free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. 
+ */ +void cmci_reenable(void) +{ +	int banks; +	if (cmci_supported(&banks)) +		cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ +	int banks; + +	if (!cmci_supported(&banks)) +		return; + +	mce_threshold_vector = intel_threshold_interrupt; +	cmci_discover(banks, 1); +	/* +	 * For CPU #0 this runs with still disabled APIC, but that's +	 * ok because only the vector is set up. We still do another +	 * check for the banks later for CPU #0 just to make sure +	 * to not miss any events. +	 */ +	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); +	cmci_recheck(); +} +  void mce_intel_feature_init(struct cpuinfo_x86 *c)  {  	intel_init_thermal(c); +	intel_init_cmci();  } diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 9b60fce09f7..f53bdcbaf38 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -85,7 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)  	 */  	rdmsr(MSR_IA32_MISC_ENABLE, l, h);  	h = apic_read(APIC_LVTTHMR); -	if ((l & (1<<3)) && (h & APIC_DM_SMI)) { +	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {  		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",  				cpu);  		return; /* -EBUSY */ @@ -111,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)  	vendor_thermal_interrupt = intel_thermal_interrupt;  	rdmsr(MSR_IA32_MISC_ENABLE, l, h); -	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); +	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);  	l = apic_read(APIC_LVTTHMR);  	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..23ee9e730f7 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,29 @@ +/* + * Common corrected MCE threshold handler code: + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> + +#include <asm/irq_vectors.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> + +static void default_threshold_interrupt(void) +{ +	printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", +			 THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ +	exit_idle(); +	irq_enter(); +	inc_irq_stat(irq_threshold_count); +	mce_threshold_vector(); +	irq_exit(); +	/* Ack only at the end to avoid potential reentry */ +	ack_APIC_irq(); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b2267..f6c70a164e3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -19,7 +19,7 @@  #include <linux/nmi.h>  #include <linux/kprobes.h> -#include <asm/apic.h> +#include <asm/genapic.h>  #include <asm/intel_arch_perfmon.h>  struct nmi_watchdog_ctlblk { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 01b1244ef1c..d67e0e48bc2 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -7,11 +7,10 @@  /*   *	Get CPU information for use by the procfs.   
*/ -#ifdef CONFIG_X86_32  static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,  			      unsigned int cpu)  { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP  	if (c->x86_max_cores * smp_num_siblings > 1) {  		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);  		seq_printf(m, "siblings\t: %d\n", @@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,  #endif  } +#ifdef CONFIG_X86_32  static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)  {  	/* @@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)  		   c->wp_works_ok ? "yes" : "no");  }  #else -static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, -			      unsigned int cpu) -{ -#ifdef CONFIG_SMP -	if (c->x86_max_cores * smp_num_siblings > 1) { -		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); -		seq_printf(m, "siblings\t: %d\n", -			   cpus_weight(per_cpu(cpu_core_map, cpu))); -		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); -		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); -		seq_printf(m, "apicid\t\t: %d\n", c->apicid); -		seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); -	} -#endif -} -  static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)  {  	seq_printf(m, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c689d19e35a..ff958248e61 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -24,12 +24,10 @@  #include <asm/apic.h>  #include <asm/hpet.h>  #include <linux/kdebug.h> -#include <asm/smp.h> +#include <asm/cpu.h>  #include <asm/reboot.h>  #include <asm/virtext.h> -#include <mach_ipi.h> -  #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f866..87d103ded1c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -99,7 +99,7 @@ print_context_stack(struct thread_info *tinfo,  				frame = frame->next_frame;  				bp = (unsigned long) frame;  			} else { -				ops->address(data, addr, bp == 0); +				ops->address(data, addr, 0);  			}  			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);  		} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index c302d070704..d35db5993fd 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  		const struct stacktrace_ops *ops, void *data)  {  	const unsigned cpu = get_cpu(); -	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; +	unsigned long *irq_stack_end = +		(unsigned long *)per_cpu(irq_stack_ptr, cpu);  	unsigned used = 0;  	struct thread_info *tinfo;  	int graph = 0; @@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  			stack = (unsigned long *) estack_end[-2];  			continue;  		} -		if (irqstack_end) { -			unsigned long *irqstack; -			irqstack = irqstack_end - -				(IRQSTACKSIZE - 64) / sizeof(*irqstack); +		if (irq_stack_end) { +			unsigned long *irq_stack; +			irq_stack = irq_stack_end - +				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); -			if (stack >= irqstack && stack < irqstack_end) { +			if (stack >= irq_stack && stack < irq_stack_end) {  				if (ops->stack(data, "IRQ") < 0)  					break;  				bp = print_context_stack(tinfo, stack, bp, -					ops, data, irqstack_end, &graph); +					ops, data, irq_stack_end, &graph);  				/*  				 * We link to the next stack (which would be  	
			 * the process stack normally) the last  				 * pointer (index -1 to end) in the IRQ stack:  				 */ -				stack = (unsigned long *) (irqstack_end[-1]); -				irqstack_end = NULL; +				stack = (unsigned long *) (irq_stack_end[-1]); +				irq_stack_end = NULL;  				ops->stack(data, "EOI");  				continue;  			} @@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  	unsigned long *stack;  	int i;  	const int cpu = smp_processor_id(); -	unsigned long *irqstack_end = -		(unsigned long *) (cpu_pda(cpu)->irqstackptr); -	unsigned long *irqstack = -		(unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); +	unsigned long *irq_stack_end = +		(unsigned long *)(per_cpu(irq_stack_ptr, cpu)); +	unsigned long *irq_stack = +		(unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);  	/*  	 * debugging aid: "show_stack(NULL, NULL);" prints the @@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  	stack = sp;  	for (i = 0; i < kstack_depth_to_print; i++) { -		if (stack >= irqstack && stack <= irqstack_end) { -			if (stack == irqstack_end) { -				stack = (unsigned long *) (irqstack_end[-1]); +		if (stack >= irq_stack && stack <= irq_stack_end) { +			if (stack == irq_stack_end) { +				stack = (unsigned long *) (irq_stack_end[-1]);  				printk(" <EOI> ");  			}  		} else { @@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)  	int i;  	unsigned long sp;  	const int cpu = smp_processor_id(); -	struct task_struct *cur = cpu_pda(cpu)->pcurrent; +	struct task_struct *cur = current;  	sp = regs->sp;  	printk("CPU %d ", cpu); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e85826829cf..508bec1cee2 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -858,6 +858,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)   */  void __init reserve_early(u64 start, u64 end, char *name)  { +	if (start >= end) +		return; +  	drop_overlaps_that_are_ok(start, end);  	__reserve_early(start, end, name, 0);  } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 504ad198e4a..639ad98238a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -13,8 +13,8 @@  #include <asm/setup.h>  #include <xen/hvc-console.h>  #include <asm/pci-direct.h> -#include <asm/pgtable.h>  #include <asm/fixmap.h> +#include <asm/pgtable.h>  #include <linux/usb/ehci_def.h>  /* Simple VGA output */ diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index eb1ef3b67dd..1736acc4d7a 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -366,10 +366,12 @@ void __init efi_init(void)  					SMBIOS_TABLE_GUID)) {  			efi.smbios = config_tables[i].table;  			printk(" SMBIOS=0x%lx ", config_tables[i].table); +#ifdef CONFIG_X86_UV  		} else if (!efi_guidcmp(config_tables[i].guid,  					UV_SYSTEM_TABLE_GUID)) {  			efi.uv_systab = config_tables[i].table;  			printk(" UVsystab=0x%lx ", config_tables[i].table); +#endif  		} else if (!efi_guidcmp(config_tables[i].guid,  					HCDP_TABLE_GUID)) {  			efi.hcdp = config_tables[i].table; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index cb783b92c50..22c3b7828c5 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -36,6 +36,7 @@  #include <asm/proto.h>  #include <asm/efi.h>  #include <asm/cacheflush.h> +#include <asm/fixmap.h>  static pgd_t save_pgd __initdata;  static unsigned long efi_flags __initdata; diff --git a/arch/x86/kernel/efi_stub_32.S 
b/arch/x86/kernel/efi_stub_32.S index ef00bb77d7e..fbe66e626c0 100644 --- a/arch/x86/kernel/efi_stub_32.S +++ b/arch/x86/kernel/efi_stub_32.S @@ -6,7 +6,7 @@   */  #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h>  /*   * efi_call_phys(void *, ...) is a function with variable parameters. @@ -113,6 +113,7 @@ ENTRY(efi_call_phys)  	movl	(%edx), %ecx  	pushl	%ecx  	ret +ENDPROC(efi_call_phys)  .previous  .data diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S index 99b47d48c9f..4c07ccab814 100644 --- a/arch/x86/kernel/efi_stub_64.S +++ b/arch/x86/kernel/efi_stub_64.S @@ -41,6 +41,7 @@ ENTRY(efi_call0)  	addq $32, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call0)  ENTRY(efi_call1)  	SAVE_XMM @@ -50,6 +51,7 @@ ENTRY(efi_call1)  	addq $32, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call1)  ENTRY(efi_call2)  	SAVE_XMM @@ -59,6 +61,7 @@ ENTRY(efi_call2)  	addq $32, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call2)  ENTRY(efi_call3)  	SAVE_XMM @@ -69,6 +72,7 @@ ENTRY(efi_call3)  	addq $32, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call3)  ENTRY(efi_call4)  	SAVE_XMM @@ -80,6 +84,7 @@ ENTRY(efi_call4)  	addq $32, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call4)  ENTRY(efi_call5)  	SAVE_XMM @@ -92,6 +97,7 @@ ENTRY(efi_call5)  	addq $48, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call5)  ENTRY(efi_call6)  	SAVE_XMM @@ -107,3 +113,4 @@ ENTRY(efi_call6)  	addq $48, %rsp  	RESTORE_XMM  	ret +ENDPROC(efi_call6) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 46469029e9d..899e8938e79 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -30,12 +30,13 @@   *	1C(%esp) - %ds   *	20(%esp) - %es   *	24(%esp) - %fs - *	28(%esp) - orig_eax - *	2C(%esp) - %eip - *	30(%esp) - %cs - *	34(%esp) - %eflags - *	38(%esp) - %oldesp - *	3C(%esp) - %oldss + *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS + *	2C(%esp) - orig_eax + *	30(%esp) - %eip + *	34(%esp) - %cs + *	38(%esp) - %eflags + *	3C(%esp) - %oldesp + *	40(%esp) - %oldss   *   * "current" is in register %ebx during any slow entries.   */ @@ -46,7 +47,7 @@  #include <asm/errno.h>  #include <asm/segment.h>  #include <asm/smp.h> -#include <asm/page.h> +#include <asm/page_types.h>  #include <asm/desc.h>  #include <asm/percpu.h>  #include <asm/dwarf2.h> @@ -101,121 +102,221 @@  #define resume_userspace_sig	resume_userspace  #endif -#define SAVE_ALL \ -	cld; \ -	pushl %fs; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	/*CFI_REL_OFFSET fs, 0;*/\ -	pushl %es; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	/*CFI_REL_OFFSET es, 0;*/\ -	pushl %ds; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	/*CFI_REL_OFFSET ds, 0;*/\ -	pushl %eax; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET eax, 0;\ -	pushl %ebp; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET ebp, 0;\ -	pushl %edi; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET edi, 0;\ -	pushl %esi; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET esi, 0;\ -	pushl %edx; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET edx, 0;\ -	pushl %ecx; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET ecx, 0;\ -	pushl %ebx; \ -	CFI_ADJUST_CFA_OFFSET 4;\ -	CFI_REL_OFFSET ebx, 0;\ -	movl $(__USER_DS), %edx; \ -	movl %edx, %ds; \ -	movl %edx, %es; \ -	movl $(__KERNEL_PERCPU), %edx; \ +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc.  Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. 
+ */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS +	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4 +.endm +.macro POP_GS pop=0 +	addl $(4 + \pop), %esp +	CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else	/* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS +	pushl %gs +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98:	popl %gs +	CFI_ADJUST_CFA_OFFSET -4 +	/*CFI_RESTORE gs*/ +  .if \pop <> 0 +	add $\pop, %esp +	CFI_ADJUST_CFA_OFFSET -\pop +  .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99:	movl $0, (%esp) +	jmp 98b +.section __ex_table, "a" +	.align 4 +	.long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98:	mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99:	movl $0, PT_GS(%esp) +	jmp 98b +.section __ex_table, "a" +	.align 4 +	.long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg +	movl %gs, \reg +	/*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg +	movl \reg, PT_GS(%esp) +	/*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg +	movl $(__KERNEL_STACK_CANARY), \reg +	movl \reg, %gs +.endm + +#endif	/* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL +	cld +	PUSH_GS +	pushl %fs +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET fs, 0;*/ +	pushl %es +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET es, 0;*/ +	pushl %ds +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET ds, 0;*/ +	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET eax, 0 +	pushl %ebp +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ebp, 0 +	pushl %edi +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET edi, 0 +	pushl %esi +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET esi, 0 +	pushl %edx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET edx, 0 +	pushl %ecx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ecx, 0 +	pushl %ebx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ebx, 0 +	movl $(__USER_DS), %edx +	movl %edx, %ds +	movl %edx, %es +	movl $(__KERNEL_PERCPU), %edx  	movl %edx, %fs +	SET_KERNEL_GS %edx +.endm -#define RESTORE_INT_REGS \ -	popl %ebx;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE ebx;\ -	popl %ecx;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE ecx;\ -	popl %edx;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE edx;\ -	popl %esi;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE esi;\ -	popl %edi;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE edi;\ -	popl %ebp;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	CFI_RESTORE ebp;\ -	popl %eax;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ +.macro RESTORE_INT_REGS +	popl %ebx +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE ebx +	popl %ecx +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE ecx +	popl %edx +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE edx +	popl %esi +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE esi +	popl %edi +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE edi +	popl %ebp +	CFI_ADJUST_CFA_OFFSET -4 +	CFI_RESTORE ebp +	popl %eax +	CFI_ADJUST_CFA_OFFSET -4  	CFI_RESTORE eax +.endm -#define RESTORE_REGS	\ -	RESTORE_INT_REGS; \ -1:	popl %ds;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	/*CFI_RESTORE ds;*/\ -2:	popl %es;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	/*CFI_RESTORE es;*/\ -3:	popl %fs;	\ -	CFI_ADJUST_CFA_OFFSET -4;\ -	/*CFI_RESTORE fs;*/\ -.pushsection .fixup,"ax";	\ -4:	movl $0,(%esp);	\ -	jmp 1b;		\ -5:	movl $0,(%esp);	\ -	jmp 2b;		\ -6:	movl $0,(%esp);	\ -	jmp 3b;		\ -.section __ex_table,"a";\ -	.align 4;	\ -	.long 1b,4b;	\ -	.long 2b,5b;	\ -	.long 
3b,6b;	\ +.macro RESTORE_REGS pop=0 +	RESTORE_INT_REGS +1:	popl %ds +	CFI_ADJUST_CFA_OFFSET -4 +	/*CFI_RESTORE ds;*/ +2:	popl %es +	CFI_ADJUST_CFA_OFFSET -4 +	/*CFI_RESTORE es;*/ +3:	popl %fs +	CFI_ADJUST_CFA_OFFSET -4 +	/*CFI_RESTORE fs;*/ +	POP_GS \pop +.pushsection .fixup, "ax" +4:	movl $0, (%esp) +	jmp 1b +5:	movl $0, (%esp) +	jmp 2b +6:	movl $0, (%esp) +	jmp 3b +.section __ex_table, "a" +	.align 4 +	.long 1b, 4b +	.long 2b, 5b +	.long 3b, 6b  .popsection +	POP_GS_EX +.endm -#define RING0_INT_FRAME \ -	CFI_STARTPROC simple;\ -	CFI_SIGNAL_FRAME;\ -	CFI_DEF_CFA esp, 3*4;\ -	/*CFI_OFFSET cs, -2*4;*/\ +.macro RING0_INT_FRAME +	CFI_STARTPROC simple +	CFI_SIGNAL_FRAME +	CFI_DEF_CFA esp, 3*4 +	/*CFI_OFFSET cs, -2*4;*/  	CFI_OFFSET eip, -3*4 +.endm -#define RING0_EC_FRAME \ -	CFI_STARTPROC simple;\ -	CFI_SIGNAL_FRAME;\ -	CFI_DEF_CFA esp, 4*4;\ -	/*CFI_OFFSET cs, -2*4;*/\ +.macro RING0_EC_FRAME +	CFI_STARTPROC simple +	CFI_SIGNAL_FRAME +	CFI_DEF_CFA esp, 4*4 +	/*CFI_OFFSET cs, -2*4;*/  	CFI_OFFSET eip, -3*4 +.endm -#define RING0_PTREGS_FRAME \ -	CFI_STARTPROC simple;\ -	CFI_SIGNAL_FRAME;\ -	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ -	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ -	CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ -	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ -	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ -	CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ -	CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ -	CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ -	CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ -	CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ -	CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ +.macro RING0_PTREGS_FRAME +	CFI_STARTPROC simple +	CFI_SIGNAL_FRAME +	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX +	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ +	CFI_OFFSET eip, PT_EIP-PT_OLDESP +	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ +	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ +	CFI_OFFSET eax, PT_EAX-PT_OLDESP +	CFI_OFFSET ebp, PT_EBP-PT_OLDESP +	CFI_OFFSET edi, PT_EDI-PT_OLDESP +	CFI_OFFSET esi, PT_ESI-PT_OLDESP +	CFI_OFFSET edx, PT_EDX-PT_OLDESP +	CFI_OFFSET ecx, PT_ECX-PT_OLDESP  	CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm  ENTRY(ret_from_fork)  	CFI_STARTPROC @@ -362,6 +463,7 @@ sysenter_exit:  	xorl %ebp,%ebp  	TRACE_IRQS_ON  1:	mov  PT_FS(%esp), %fs +	PTGS_TO_GS  	ENABLE_INTERRUPTS_SYSEXIT  #ifdef CONFIG_AUDITSYSCALL @@ -410,6 +512,7 @@ sysexit_audit:  	.align 4  	.long 1b,2b  .popsection +	PTGS_TO_GS_EX  ENDPROC(ia32_sysenter_target)  	# system call handler stub @@ -452,8 +555,7 @@ restore_all:  restore_nocheck:  	TRACE_IRQS_IRET  restore_nocheck_notrace: -	RESTORE_REGS -	addl $4, %esp			# skip orig_eax/error_code +	RESTORE_REGS 4			# skip orig_eax/error_code  	CFI_ADJUST_CFA_OFFSET -4  irq_return:  	INTERRUPT_RETURN @@ -595,28 +697,50 @@ syscall_badsys:  END(syscall_badsys)  	CFI_ENDPROC -#define FIXUP_ESPFIX_STACK \ -	/* since we are on a wrong stack, we cant make it a C code :( */ \ -	PER_CPU(gdt_page, %ebx); \ -	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ -	addl %esp, %eax; \ -	pushl $__KERNEL_DS; \ -	CFI_ADJUST_CFA_OFFSET 4; \ -	pushl %eax; \ -	CFI_ADJUST_CFA_OFFSET 4; \ -	lss (%esp), %esp; \ -	CFI_ADJUST_CFA_OFFSET -8; -#define UNWIND_ESPFIX_STACK \ -	movl %ss, %eax; \ -	/* see if on espfix stack */ \ -	cmpw $__ESPFIX_SS, %ax; \ -	jne 27f; \ -	movl $__KERNEL_DS, %eax; \ -	movl %eax, %ds; \ -	movl %eax, %es; \ -	/* switch to normal stack */ \ -	FIXUP_ESPFIX_STACK; \ -27:; +/* + * System calls that need a pt_regs pointer. 
+ */ +#define PTREGSCALL(name) \ +	ALIGN; \ +ptregs_##name: \ +	leal 4(%esp),%eax; \ +	jmp sys_##name; + +PTREGSCALL(iopl) +PTREGSCALL(fork) +PTREGSCALL(clone) +PTREGSCALL(vfork) +PTREGSCALL(execve) +PTREGSCALL(sigaltstack) +PTREGSCALL(sigreturn) +PTREGSCALL(rt_sigreturn) +PTREGSCALL(vm86) +PTREGSCALL(vm86old) + +.macro FIXUP_ESPFIX_STACK +	/* since we are on a wrong stack, we cant make it a C code :( */ +	PER_CPU(gdt_page, %ebx) +	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) +	addl %esp, %eax +	pushl $__KERNEL_DS +	CFI_ADJUST_CFA_OFFSET 4 +	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4 +	lss (%esp), %esp +	CFI_ADJUST_CFA_OFFSET -8 +.endm +.macro UNWIND_ESPFIX_STACK +	movl %ss, %eax +	/* see if on espfix stack */ +	cmpw $__ESPFIX_SS, %ax +	jne 27f +	movl $__KERNEL_DS, %eax +	movl %eax, %ds +	movl %eax, %es +	/* switch to normal stack */ +	FIXUP_ESPFIX_STACK +27: +.endm  /*   * Build the entry stubs and pointer table with some assembler magic. @@ -672,7 +796,7 @@ common_interrupt:  ENDPROC(common_interrupt)  	CFI_ENDPROC -#define BUILD_INTERRUPT(name, nr)	\ +#define BUILD_INTERRUPT3(name, nr, fn)	\  ENTRY(name)				\  	RING0_INT_FRAME;		\  	pushl $~(nr);			\ @@ -680,13 +804,15 @@ ENTRY(name)				\  	SAVE_ALL;			\  	TRACE_IRQS_OFF			\  	movl %esp,%eax;			\ -	call smp_##name;		\ +	call fn;			\  	jmp ret_from_intr;		\  	CFI_ENDPROC;			\  ENDPROC(name) +#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name) +  /* The include is where all of the SMP etc. interrupts come from */ -#include "entry_arch.h" +#include <asm/entry_arch.h>  ENTRY(coprocessor_error)  	RING0_INT_FRAME @@ -1068,7 +1194,10 @@ ENTRY(page_fault)  	CFI_ADJUST_CFA_OFFSET 4  	ALIGN  error_code: -	/* the function address is in %fs's slot on the stack */ +	/* the function address is in %gs's slot on the stack */ +	pushl %fs +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET fs, 0*/  	pushl %es  	CFI_ADJUST_CFA_OFFSET 4  	/*CFI_REL_OFFSET es, 0*/ @@ -1097,20 +1226,15 @@ error_code:  	CFI_ADJUST_CFA_OFFSET 4  	CFI_REL_OFFSET ebx, 0  	cld -	pushl %fs -	CFI_ADJUST_CFA_OFFSET 4 -	/*CFI_REL_OFFSET fs, 0*/  	movl $(__KERNEL_PERCPU), %ecx  	movl %ecx, %fs  	UNWIND_ESPFIX_STACK -	popl %ecx -	CFI_ADJUST_CFA_OFFSET -4 -	/*CFI_REGISTER es, ecx*/ -	movl PT_FS(%esp), %edi		# get the function address +	GS_TO_REG %ecx +	movl PT_GS(%esp), %edi		# get the function address  	movl PT_ORIG_EAX(%esp), %edx	# get the error code  	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart -	mov  %ecx, PT_FS(%esp) -	/*CFI_REL_OFFSET fs, ES*/ +	REG_TO_PTGS %ecx +	SET_KERNEL_GS %ecx  	movl $(__USER_DS), %ecx  	movl %ecx, %ds  	movl %ecx, %es @@ -1134,26 +1258,27 @@ END(page_fault)   * by hand onto the new stack - while updating the return eip past   * the instruction that would have done it for sysenter.   
*/ -#define FIX_STACK(offset, ok, label)		\ -	cmpw $__KERNEL_CS,4(%esp);		\ -	jne ok;					\ -label:						\ -	movl TSS_sysenter_sp0+offset(%esp),%esp;	\ -	CFI_DEF_CFA esp, 0;			\ -	CFI_UNDEFINED eip;			\ -	pushfl;					\ -	CFI_ADJUST_CFA_OFFSET 4;		\ -	pushl $__KERNEL_CS;			\ -	CFI_ADJUST_CFA_OFFSET 4;		\ -	pushl $sysenter_past_esp;		\ -	CFI_ADJUST_CFA_OFFSET 4;		\ +.macro FIX_STACK offset ok label +	cmpw $__KERNEL_CS, 4(%esp) +	jne \ok +\label: +	movl TSS_sysenter_sp0 + \offset(%esp), %esp +	CFI_DEF_CFA esp, 0 +	CFI_UNDEFINED eip +	pushfl +	CFI_ADJUST_CFA_OFFSET 4 +	pushl $__KERNEL_CS +	CFI_ADJUST_CFA_OFFSET 4 +	pushl $sysenter_past_esp +	CFI_ADJUST_CFA_OFFSET 4  	CFI_REL_OFFSET eip, 0 +.endm  ENTRY(debug)  	RING0_INT_FRAME  	cmpl $ia32_sysenter_target,(%esp)  	jne debug_stack_correct -	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn  debug_stack_correct:  	pushl $-1			# mark this as an int  	CFI_ADJUST_CFA_OFFSET 4 @@ -1211,7 +1336,7 @@ nmi_stack_correct:  nmi_stack_fixup:  	RING0_INT_FRAME -	FIX_STACK(12,nmi_stack_correct, 1) +	FIX_STACK 12, nmi_stack_correct, 1  	jmp nmi_stack_correct  nmi_debug_stack_check: @@ -1222,7 +1347,7 @@ nmi_debug_stack_check:  	jb nmi_stack_correct  	cmpl $debug_esp_fix_insn,(%esp)  	ja nmi_stack_correct -	FIX_STACK(24,nmi_stack_correct, 1) +	FIX_STACK 24, nmi_stack_correct, 1  	jmp nmi_stack_correct  nmi_espfix_stack: @@ -1234,7 +1359,7 @@ nmi_espfix_stack:  	CFI_ADJUST_CFA_OFFSET 4  	pushl %esp  	CFI_ADJUST_CFA_OFFSET 4 -	addw $4, (%esp) +	addl $4, (%esp)  	/* copy the iret frame of 12 bytes */  	.rept 3  	pushl 16(%esp) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a1346217e43..7ba4621c0df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -48,10 +48,11 @@  #include <asm/unistd.h>  #include <asm/thread_info.h>  #include <asm/hw_irq.h> -#include <asm/page.h> +#include <asm/page_types.h>  #include <asm/irqflags.h>  #include <asm/paravirt.h>  #include <asm/ftrace.h> +#include <asm/percpu.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  
*/  #include <linux/elf-em.h> @@ -76,20 +77,17 @@ ENTRY(ftrace_caller)  	movq 8(%rbp), %rsi  	subq $MCOUNT_INSN_SIZE, %rdi -.globl ftrace_call -ftrace_call: +GLOBAL(ftrace_call)  	call ftrace_stub  	MCOUNT_RESTORE_FRAME  #ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: +GLOBAL(ftrace_graph_call)  	jmp ftrace_stub  #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub)  	retq  END(ftrace_caller) @@ -109,8 +107,7 @@ ENTRY(mcount)  	jnz ftrace_graph_caller  #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub)  	retq  trace: @@ -147,9 +144,7 @@ ENTRY(ftrace_graph_caller)  	retq  END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: +GLOBAL(return_to_handler)  	subq  $80, %rsp  	movq %rax, (%rsp) @@ -187,6 +182,7 @@ return_to_handler:  ENTRY(native_usergs_sysret64)  	swapgs  	sysretq +ENDPROC(native_usergs_sysret64)  #endif /* CONFIG_PARAVIRT */ @@ -209,7 +205,7 @@ ENTRY(native_usergs_sysret64)  	/* %rsp:at FRAMEEND */  	.macro FIXUP_TOP_OF_STACK tmp offset=0 -	movq %gs:pda_oldrsp,\tmp +	movq PER_CPU_VAR(old_rsp),\tmp  	movq \tmp,RSP+\offset(%rsp)  	movq $__USER_DS,SS+\offset(%rsp)  	movq $__USER_CS,CS+\offset(%rsp) @@ -220,7 +216,7 @@ ENTRY(native_usergs_sysret64)  	.macro RESTORE_TOP_OF_STACK tmp offset=0  	movq RSP+\offset(%rsp),\tmp -	movq \tmp,%gs:pda_oldrsp +	movq \tmp,PER_CPU_VAR(old_rsp)  	movq EFLAGS+\offset(%rsp),\tmp  	movq \tmp,R11+\offset(%rsp)  	.endm @@ -336,15 +332,15 @@ ENTRY(save_args)  	je 1f  	SWAPGS  	/* -	 * irqcount is used to check if a CPU is already on an interrupt stack +	 * irq_count is used to check if a CPU is already on an interrupt stack  	 * or not. While this is essentially redundant with preempt_count it is  	 * a little cheaper to use a separate counter in the PDA (short of  	 * moving irq_enter into assembly, which would be too much work)  	 */ -1:	incl %gs:pda_irqcount +1:	incl PER_CPU_VAR(irq_count)  	jne 2f  	popq_cfi %rax			/* move return address... */ -	mov %gs:pda_irqstackptr,%rsp +	mov PER_CPU_VAR(irq_stack_ptr),%rsp  	EMPTY_FRAME 0  	pushq_cfi %rbp			/* backlink for unwinder */  	pushq_cfi %rax			/* ... to the new stack */ @@ -409,6 +405,8 @@ END(save_paranoid)  ENTRY(ret_from_fork)  	DEFAULT_FRAME +	LOCK ; btr $TIF_FORK,TI_flags(%r8) +  	push kernel_eflags(%rip)  	CFI_ADJUST_CFA_OFFSET 8  	popf					# reset kernel eflags @@ -468,7 +466,7 @@ END(ret_from_fork)  ENTRY(system_call)  	CFI_STARTPROC	simple  	CFI_SIGNAL_FRAME -	CFI_DEF_CFA	rsp,PDA_STACKOFFSET +	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET  	CFI_REGISTER	rip,rcx  	/*CFI_REGISTER	rflags,r11*/  	SWAPGS_UNSAFE_STACK @@ -479,8 +477,8 @@ ENTRY(system_call)  	 */  ENTRY(system_call_after_swapgs) -	movq	%rsp,%gs:pda_oldrsp -	movq	%gs:pda_kernelstack,%rsp +	movq	%rsp,PER_CPU_VAR(old_rsp) +	movq	PER_CPU_VAR(kernel_stack),%rsp  	/*  	 * No need to follow this irqs off/on section - it's straight  	 * and short: @@ -523,7 +521,7 @@ sysret_check:  	CFI_REGISTER	rip,rcx  	RESTORE_ARGS 0,-ARG_SKIP,1  	/*CFI_REGISTER	rflags,r11*/ -	movq	%gs:pda_oldrsp, %rsp +	movq	PER_CPU_VAR(old_rsp), %rsp  	USERGS_SYSRET64  	CFI_RESTORE_STATE @@ -630,16 +628,14 @@ tracesys:   * Syscall return path ending with IRET.   * Has correct top of stack, but partial stack frame.   
*/ -	.globl int_ret_from_sys_call -	.globl int_with_check -int_ret_from_sys_call: +GLOBAL(int_ret_from_sys_call)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF  	testl $3,CS-ARGOFFSET(%rsp)  	je retint_restore_args  	movl $_TIF_ALLWORK_MASK,%edi  	/* edi:	mask to check */ -int_with_check: +GLOBAL(int_with_check)  	LOCKDEP_SYS_EXIT_IRQ  	GET_THREAD_INFO(%rcx)  	movl TI_flags(%rcx),%edx @@ -833,11 +829,11 @@ common_interrupt:  	XCPT_FRAME  	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */  	interrupt do_IRQ -	/* 0(%rsp): oldrsp-ARGOFFSET */ +	/* 0(%rsp): old_rsp-ARGOFFSET */  ret_from_intr:  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	decl %gs:pda_irqcount +	decl PER_CPU_VAR(irq_count)  	leaveq  	CFI_DEF_CFA_REGISTER	rsp  	CFI_ADJUST_CFA_OFFSET	-8 @@ -982,10 +978,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \  	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt  #endif +#ifdef CONFIG_X86_UV  apicinterrupt UV_BAU_MESSAGE \  	uv_bau_message_intr1 uv_bau_message_interrupt +#endif  apicinterrupt LOCAL_TIMER_VECTOR \  	apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ +	generic_interrupt smp_generic_interrupt  #ifdef CONFIG_SMP  apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ @@ -1073,10 +1073,10 @@ ENTRY(\sym)  	TRACE_IRQS_OFF  	movq %rsp,%rdi		/* pt_regs pointer */  	xorl %esi,%esi		/* no error code */ -	movq %gs:pda_data_offset, %rbp -	subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) +	PER_CPU(init_tss, %rbp) +	subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)  	call \do_sym -	addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) +	addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)  	jmp paranoid_exit	/* %ebx: no swapgs flag */  	CFI_ENDPROC  END(\sym) @@ -1138,7 +1138,7 @@ ENTRY(native_load_gs_index)  	CFI_STARTPROC  	pushf  	CFI_ADJUST_CFA_OFFSET 8 -	DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) +	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)  	SWAPGS  gs_change:  	movl %edi,%gs @@ -1260,14 +1260,14 @@ ENTRY(call_softirq)  	CFI_REL_OFFSET rbp,0  	mov  %rsp,%rbp  	CFI_DEF_CFA_REGISTER rbp -	incl %gs:pda_irqcount -	cmove %gs:pda_irqstackptr,%rsp +	incl PER_CPU_VAR(irq_count) +	cmove PER_CPU_VAR(irq_stack_ptr),%rsp  	push  %rbp			# backlink for old unwinder  	call __do_softirq  	leaveq  	CFI_DEF_CFA_REGISTER	rsp  	CFI_ADJUST_CFA_OFFSET   -8 -	decl %gs:pda_irqcount +	decl PER_CPU_VAR(irq_count)  	ret  	CFI_ENDPROC  END(call_softirq) @@ -1297,15 +1297,15 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)  	movq %rdi, %rsp            # we don't return, adjust the stack frame  	CFI_ENDPROC  	DEFAULT_FRAME -11:	incl %gs:pda_irqcount +11:	incl PER_CPU_VAR(irq_count)  	movq %rsp,%rbp  	CFI_DEF_CFA_REGISTER rbp -	cmovzq %gs:pda_irqstackptr,%rsp +	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp  	pushq %rbp			# backlink for old unwinder  	call xen_evtchn_do_upcall  	popq %rsp  	CFI_DEF_CFA_REGISTER rsp -	decl %gs:pda_irqcount +	decl PER_CPU_VAR(irq_count)  	jmp  error_exit  	CFI_ENDPROC  END(do_hypervisor_callback) diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c deleted file mode 100644 index 53699c931ad..00000000000 --- a/arch/x86/kernel/es7000_32.c +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - *             Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation.  
All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/errno.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <asm/io.h> -#include <asm/nmi.h> -#include <asm/smp.h> -#include <asm/atomic.h> -#include <asm/apicdef.h> -#include <mach_mpparse.h> -#include <asm/genapic.h> -#include <asm/setup.h> - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS		0 -#define ES7000_CLASSIC		1 -#define ES7000_ZORRO		2 - - -#define	MIP_REG			1 -#define	MIP_PSAI_REG		4 - -#define	MIP_BUSY		1 -#define	MIP_SPIN		0xf0000 -#define	MIP_VALID		0x0100000000000000ULL -#define	MIP_PORT(VALUE)	((VALUE >> 32) & 0xffff) - -#define	MIP_RD_LO(VALUE)	(VALUE & 0xffffffff) - -struct mip_reg_info { -	unsigned long long mip_info; -	unsigned long long delivery_info; -	unsigned long long host_reg; -	unsigned long long mip_reg; -}; - -struct part_info { -	unsigned char type; -	unsigned char length; -	unsigned char part_id; -	unsigned char apic_mode; -	unsigned long snum; -	char ptype[16]; -	char sname[64]; -	char pname[64]; -}; - -struct psai { -	unsigned long long entry_type; -	unsigned long long addr; -	unsigned long long bep_addr; -}; - -struct es7000_mem_info { -	unsigned char type; -	unsigned char length; -	unsigned char resv[6]; -	unsigned long long  start; -	unsigned long long  size; -}; - -struct es7000_oem_table { -	unsigned long long hdr; -	struct mip_reg_info mip; -	struct part_info pif; -	struct es7000_mem_info shm; -	struct psai psai; -}; - -#ifdef CONFIG_ACPI - -struct oem_table { -	struct acpi_table_header Header; -	u32 OEMTableAddr; -	u32 OEMTableSize; -}; - -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); -extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); -#endif - -struct mip_reg { -	unsigned long long off_0; -	unsigned long long off_8; -	unsigned long long off_10; -	unsigned long long off_18; -	unsigned long long off_20; -	unsigned long long off_28; -	unsigned long long off_30; -	unsigned long long off_38; -}; - -#define	MIP_SW_APIC		0x1020b -#define	MIP_FUNC(VALUE)		(VALUE & 0xff) - -/* - * ES7000 Globals - */ - -static volatile unsigned long	*psai = NULL; -static struct mip_reg		*mip_reg; -static struct mip_reg		*host_reg; -static int 			mip_port; -static unsigned long		mip_addr, host_addr; - -int es7000_plat; - -/* - * GSI override for ES7000 platforms. 
- */ - -static unsigned int base; - -static int -es7000_rename_gsi(int ioapic, int gsi) -{ -	if (es7000_plat == ES7000_ZORRO) -		return gsi; - -	if (!base) { -		int i; -		for (i = 0; i < nr_ioapics; i++) -			base += nr_ioapic_registers[i]; -	} - -	if (!ioapic && (gsi < 16)) -		gsi += base; -	return gsi; -} - -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) -{ -	unsigned long vect = 0, psaival = 0; - -	if (psai == NULL) -		return -1; - -	vect = ((unsigned long)__pa(eip)/0x1000) << 16; -	psaival = (0x1000000 | vect | cpu); - -	while (*psai & 0x1000000) -		; - -	*psai = psaival; - -	return 0; -} - -static void noop_wait_for_deassert(atomic_t *deassert_not_used) -{ -} - -static int __init es7000_update_genapic(void) -{ -	genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; - -	/* MPENTIUMIII */ -	if (boot_cpu_data.x86 == 6 && -	    (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { -		es7000_update_genapic_to_cluster(); -		genapic->wait_for_init_deassert = noop_wait_for_deassert; -		genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; -	} - -	return 0; -} - -void __init -setup_unisys(void) -{ -	/* -	 * Determine the generation of the ES7000 currently running. -	 * -	 * es7000_plat = 1 if the machine is a 5xx ES7000 box -	 * es7000_plat = 2 if the machine is a x86_64 ES7000 box -	 * -	 */ -	if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) -		es7000_plat = ES7000_ZORRO; -	else -		es7000_plat = ES7000_CLASSIC; -	ioapic_renumber_irq = es7000_rename_gsi; - -	x86_quirks->update_genapic = es7000_update_genapic; -} - -/* - * Parse the OEM Table - */ - -int __init -parse_unisys_oem (char *oemptr) -{ -	int                     i; -	int 			success = 0; -	unsigned char           type, size; -	unsigned long           val; -	char                    *tp = NULL; -	struct psai             *psaip = NULL; -	struct mip_reg_info 	*mi; -	struct mip_reg		*host, *mip; - -	tp = oemptr; - -	tp += 8; - -	for (i=0; i <= 6; i++) { -		type = *tp++; -		size = *tp++; -		tp -= 2; -		switch (type) { -		case MIP_REG: -			mi = (struct mip_reg_info *)tp; -			val = MIP_RD_LO(mi->host_reg); -			host_addr = val; -			host = (struct mip_reg *)val; -			host_reg = __va(host); -			val = MIP_RD_LO(mi->mip_reg); -			mip_port = MIP_PORT(mi->mip_info); -			mip_addr = val; -			mip = (struct mip_reg *)val; -			mip_reg = __va(mip); -			pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", -				 (unsigned long)host_reg); -			pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", -				 (unsigned long)mip_reg); -			success++; -			break; -		case MIP_PSAI_REG: -			psaip = (struct psai *)tp; -			if (tp != NULL) { -				if (psaip->addr) -					psai = __va(psaip->addr); -				else -					psai = NULL; -				success++; -			} -			break; -		default: -			break; -		} -		tp += size; -	} - -	if (success < 2) { -		es7000_plat = NON_UNISYS; -	} else -		setup_unisys(); -	return es7000_plat; -} - -#ifdef CONFIG_ACPI -static unsigned long oem_addrX; -static unsigned long oem_size; -int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ -	struct acpi_table_header *header = NULL; -	int i = 0; - -	while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { -		if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { -			struct oem_table *t = (struct oem_table *)header; - -			oem_addrX = t->OEMTableAddr; -			oem_size = t->OEMTableSize; - -			*oem_addr = (unsigned long)__acpi_map_table(oem_addrX, -								    oem_size); -			return 0; -		} -	} -	return -1; -} - -void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) -{ -} 
-#endif - -static void -es7000_spin(int n) -{ -	int i = 0; - -	while (i++ < n) -		rep_nop(); -} - -static int __init -es7000_mip_write(struct mip_reg *mip_reg) -{ -	int			status = 0; -	int			spin; - -	spin = MIP_SPIN; -	while (((unsigned long long)host_reg->off_38 & -		(unsigned long long)MIP_VALID) != 0) { -			if (--spin <= 0) { -				printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); -				return -1; -			} -		es7000_spin(MIP_SPIN); -	} - -	memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); -	outb(1, mip_port); - -	spin = MIP_SPIN; - -	while (((unsigned long long)mip_reg->off_38 & -		(unsigned long long)MIP_VALID) == 0) { -		if (--spin <= 0) { -			printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); -			return -1; -		} -		es7000_spin(MIP_SPIN); -	} - -	status = ((unsigned long long)mip_reg->off_0 & -		(unsigned long long)0xffff0000000000ULL) >> 48; -	mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & -		(unsigned long long)~MIP_VALID); -	return status; -} - -void __init -es7000_sw_apic(void) -{ -	if (es7000_plat) { -		int mip_status; -		struct mip_reg es7000_mip_reg; - -		printk("ES7000: Enabling APIC mode.\n"); -        	memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); -        	es7000_mip_reg.off_0 = MIP_SW_APIC; -        	es7000_mip_reg.off_38 = (MIP_VALID); -        	while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) -              		printk("es7000_sw_apic: command failed, status = %x\n", -				mip_status); -		return; -	} -} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b9a4d8c4b93..f5b27224769 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -26,27 +26,6 @@  #include <asm/bios_ebda.h>  #include <asm/trampoline.h> -/* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda; - -#ifdef CONFIG_SMP -/* - * We install an empty cpu_pda pointer table to indicate to early users - * (numa_set_node) that the cpu_pda pointer table for cpus other than - * the boot cpu is not yet setup. - */ -static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; -#else -static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; -#endif - -void __init x86_64_init_pda(void) -{ -	_cpu_pda = __cpu_pda; -	cpu_pda(0) = &_boot_cpu_pda; -	pda_init(0); -} -  static void __init zap_identity_mappings(void)  {  	pgd_t *pgd = pgd_offset_k(0UL); @@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)  	if (console_loglevel == 10)  		early_printk("Kernel alive\n"); -	x86_64_init_pda(); -  	x86_64_start_reservations(real_mode_data);  } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index e835b4eea70..c32ca19d591 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -11,14 +11,15 @@  #include <linux/init.h>  #include <linux/linkage.h>  #include <asm/segment.h> -#include <asm/page.h> -#include <asm/pgtable.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h>  #include <asm/desc.h>  #include <asm/cache.h>  #include <asm/thread_info.h>  #include <asm/asm-offsets.h>  #include <asm/setup.h>  #include <asm/processor-flags.h> +#include <asm/percpu.h>  /* Physical address */  #define pa(X) ((X) - __PAGE_OFFSET) @@ -429,14 +430,34 @@ is386:	movl $2,%ecx		# set MP  	ljmp $(__KERNEL_CS),$1f  1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers  	movl %eax,%ss			# after changing gdt. 
-	movl %eax,%fs			# gets reset once there's real percpu  	movl $(__USER_DS),%eax		# DS/ES contains default USER segment  	movl %eax,%ds  	movl %eax,%es -	xorl %eax,%eax			# Clear GS and LDT +	movl $(__KERNEL_PERCPU), %eax +	movl %eax,%fs			# set this cpu's percpu + +#ifdef CONFIG_CC_STACKPROTECTOR +	/* +	 * The linker can't handle this by relocation.  Manually set +	 * base address in stack canary segment descriptor. +	 */ +	cmpb $0,ready +	jne 1f +	movl $per_cpu__gdt_page,%eax +	movl $per_cpu__stack_canary,%ecx +	subl $20, %ecx +	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) +	shrl $16, %ecx +	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) +	movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +1: +#endif +	movl $(__KERNEL_STACK_CANARY),%eax  	movl %eax,%gs + +	xorl %eax,%eax			# Clear LDT  	lldt %ax  	cld			# gcc2 wants the direction flag cleared at all times @@ -446,8 +467,6 @@ is386:	movl $2,%ecx		# set MP  	movb $1, ready  	cmpb $0,%cl		# the first CPU calls start_kernel  	je   1f -	movl $(__KERNEL_PERCPU), %eax -	movl %eax,%fs		# set this cpu's percpu  	movl (stack_start), %esp  1:  #endif /* CONFIG_SMP */ @@ -548,12 +567,8 @@ early_fault:  	pushl %eax  	pushl %edx		/* trapno */  	pushl $fault_msg -#ifdef CONFIG_EARLY_PRINTK -	call early_printk -#else  	call printk  #endif -#endif  	call dump_stack  hlt_loop:  	hlt @@ -580,11 +595,10 @@ ignore_int:  	pushl 32(%esp)  	pushl 40(%esp)  	pushl $int_msg -#ifdef CONFIG_EARLY_PRINTK -	call early_printk -#else  	call printk -#endif + +	call dump_stack +  	addl $(5*4),%esp  	popl %ds  	popl %es @@ -660,7 +674,7 @@ early_recursion_flag:  	.long 0  int_msg: -	.asciz "Unknown interrupt or fault at EIP %p %p %p\n" +	.asciz "Unknown interrupt or fault at: %p %p %p\n"  fault_msg:  /* fault info: */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0e275d49556..54b29bb24e7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,6 +19,7 @@  #include <asm/msr.h>  #include <asm/cache.h>  #include <asm/processor-flags.h> +#include <asm/percpu.h>  #ifdef CONFIG_PARAVIRT  #include <asm/asm-offsets.h> @@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)  	movl %eax,%fs  	movl %eax,%gs -	/*  -	 * Setup up a dummy PDA. this is just for some early bootup code -	 * that does in_interrupt()  -	 */  +	/* Set up %gs. +	 * +	 * The base of %gs always points to the bottom of the irqstack +	 * union.  If the stack protector canary is enabled, it is +	 * located at %gs:40.  Note that, on SMP, the boot cpu uses +	 * init data section till per cpu areas are set up. 
+	 */  movl	$MSR_GS_BASE,%ecx -	movq	$empty_zero_page,%rax +	movq	initial_gs(%rip),%rax  	movq    %rax,%rdx  	shrq	$32,%rdx  	wrmsr	 @@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)  	.align	8  	ENTRY(initial_code)  	.quad	x86_64_start_kernel +	ENTRY(initial_gs) +	.quad	INIT_PER_CPU_VAR(irq_stack_union)  	__FINITDATA  	ENTRY(stack_start) @@ -323,8 +329,6 @@ early_idt_ripmsg:  #endif /* CONFIG_EARLY_PRINTK */  	.previous -.balign PAGE_SIZE -  #define NEXT_PAGE(name) \  	.balign	PAGE_SIZE; \  ENTRY(name) @@ -401,7 +405,8 @@ NEXT_PAGE(level2_spare_pgt)  	.globl early_gdt_descr  early_gdt_descr:  	.word	GDT_ENTRIES*8-1 -	.quad   per_cpu__gdt_page +early_gdt_descr_base: +	.quad	INIT_PER_CPU_VAR(gdt_page)  ENTRY(phys_base)  	/* This must match the first entry in level2_kernel_pgt */ @@ -412,7 +417,7 @@ ENTRY(phys_base)  	.section .bss, "aw", @nobits  	.align L1_CACHE_BYTES  ENTRY(idt_table) -	.skip 256 * 16 +	.skip IDT_ENTRIES * 16  	.section .bss.page_aligned, "aw", @nobits  	.align PAGE_SIZE diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 11d5093eb28..df89102bef8 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -22,7 +22,6 @@  #include <asm/pgtable.h>  #include <asm/desc.h>  #include <asm/apic.h> -#include <asm/arch_hooks.h>  #include <asm/i8259.h>  /* diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index b12208f4dfe..99c4d308f16 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)  	t->io_bitmap_max = bytes; -#ifdef CONFIG_X86_32 -	/* -	 * Sets the lazy trigger so that the next I/O operation will -	 * reload the correct bitmap. -	 * Reset the owner so that a process switch will not set -	 * tss->io_bitmap_base to IO_BITMAP_OFFSET. -	 */ -	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -	tss->io_bitmap_owner = NULL; -#else  	/* Update the TSS: */  	memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); -#endif  	put_cpu(); @@ -131,9 +120,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)  }  #ifdef CONFIG_X86_32 -asmlinkage long sys_iopl(unsigned long regsp) +long sys_iopl(struct pt_regs *regs)  { -	struct pt_regs *regs = (struct pt_regs *)&regsp;  	unsigned int level = regs->bx;  	struct thread_struct *t = &current->thread;  	int rc; diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c deleted file mode 100644 index 285bbf8831f..00000000000 --- a/arch/x86/kernel/ipi.c +++ /dev/null @@ -1,190 +0,0 @@ -#include <linux/cpumask.h> -#include <linux/interrupt.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/cache.h> -#include <linux/cpu.h> -#include <linux/module.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/apic.h> -#include <asm/proto.h> - -#ifdef CONFIG_X86_32 -#include <mach_apic.h> -#include <mach_ipi.h> - -/* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. 
- */ - -static inline int __prepare_ICR(unsigned int shortcut, int vector) -{ -	unsigned int icr = shortcut | APIC_DEST_LOGICAL; - -	switch (vector) { -	default: -		icr |= APIC_DM_FIXED | vector; -		break; -	case NMI_VECTOR: -		icr |= APIC_DM_NMI; -		break; -	} -	return icr; -} - -static inline int __prepare_ICR2(unsigned int mask) -{ -	return SET_APIC_DEST_FIELD(mask); -} - -void __send_IPI_shortcut(unsigned int shortcut, int vector) -{ -	/* -	 * Subtle. In the case of the 'never do double writes' workaround -	 * we have to lock out interrupts to be safe.  As we don't care -	 * of the value read we use an atomic rmw access to avoid costly -	 * cli/sti.  Otherwise we use an even cheaper single atomic write -	 * to the APIC. -	 */ -	unsigned int cfg; - -	/* -	 * Wait for idle. -	 */ -	apic_wait_icr_idle(); - -	/* -	 * No need to touch the target chip field -	 */ -	cfg = __prepare_ICR(shortcut, vector); - -	/* -	 * Send the IPI. The write to APIC_ICR fires this off. -	 */ -	apic_write(APIC_ICR, cfg); -} - -void send_IPI_self(int vector) -{ -	__send_IPI_shortcut(APIC_DEST_SELF, vector); -} - -/* - * This is used to send an IPI with no shorthand notation (the destination is - * specified in bits 56 to 63 of the ICR). - */ -static inline void __send_IPI_dest_field(unsigned long mask, int vector) -{ -	unsigned long cfg; - -	/* -	 * Wait for idle. -	 */ -	if (unlikely(vector == NMI_VECTOR)) -		safe_apic_wait_icr_idle(); -	else -		apic_wait_icr_idle(); - -	/* -	 * prepare target chip field -	 */ -	cfg = __prepare_ICR2(mask); -	apic_write(APIC_ICR2, cfg); - -	/* -	 * program the ICR -	 */ -	cfg = __prepare_ICR(0, vector); - -	/* -	 * Send the IPI. The write to APIC_ICR fires this off. -	 */ -	apic_write(APIC_ICR, cfg); -} - -/* - * This is only used on smaller machines. - */ -void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) -{ -	unsigned long mask = cpumask_bits(cpumask)[0]; -	unsigned long flags; - -	local_irq_save(flags); -	WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); -	__send_IPI_dest_field(mask, vector); -	local_irq_restore(flags); -} - -void send_IPI_mask_sequence(const struct cpumask *mask, int vector) -{ -	unsigned long flags; -	unsigned int query_cpu; - -	/* -	 * Hack. The clustered APIC addressing mode doesn't allow us to send -	 * to an arbitrary mask, so I do a unicasts to each CPU instead. This -	 * should be modified to do 1 message per cluster ID - mbligh -	 */ - -	local_irq_save(flags); -	for_each_cpu(query_cpu, mask) -		__send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); -	local_irq_restore(flags); -} - -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) -{ -	unsigned long flags; -	unsigned int query_cpu; -	unsigned int this_cpu = smp_processor_id(); - -	/* See Hack comment above */ - -	local_irq_save(flags); -	for_each_cpu(query_cpu, mask) -		if (query_cpu != this_cpu) -			__send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), -					      vector); -	local_irq_restore(flags); -} - -/* must come after the send_IPI functions above for inlining */ -static int convert_apicid_to_cpu(int apic_id) -{ -	int i; - -	for_each_possible_cpu(i) { -		if (per_cpu(x86_cpu_to_apicid, i) == apic_id) -			return i; -	} -	return -1; -} - -int safe_smp_processor_id(void) -{ -	int apicid, cpuid; - -	if (!boot_cpu_has(X86_FEATURE_APIC)) -		return 0; - -	apicid = hard_smp_processor_id(); -	if (apicid == BAD_APICID) -		return 0; - -	cpuid = convert_apicid_to_cpu(apicid); - -	return cpuid >= 0 ? 
cpuid : 0; -} -#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3973e2df7f8..b864341dcc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -6,13 +6,18 @@  #include <linux/kernel_stat.h>  #include <linux/seq_file.h>  #include <linux/smp.h> +#include <linux/ftrace.h>  #include <asm/apic.h>  #include <asm/io_apic.h>  #include <asm/irq.h> +#include <asm/idle.h>  atomic_t irq_err_count; +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; +  /*   * 'what should we do if we get a hw irq event on an illegal vector'.   * each architecture has to answer this themselves. @@ -36,11 +41,7 @@ void ack_bad_irq(unsigned int irq)  #endif  } -#ifdef CONFIG_X86_32 -# define irq_stats(x)		(&per_cpu(irq_stat, x)) -#else -# define irq_stats(x)		cpu_pda(x) -#endif +#define irq_stats(x)		(&per_cpu(irq_stat, x))  /*   * /proc/interrupts printing:   */ @@ -58,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p)  		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);  	seq_printf(p, "  Local timer interrupts\n");  #endif +	if (generic_interrupt_extension) { +		seq_printf(p, "PLT: "); +		for_each_online_cpu(j) +			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); +		seq_printf(p, "  Platform interrupts\n"); +	}  #ifdef CONFIG_SMP  	seq_printf(p, "RES: ");  	for_each_online_cpu(j) @@ -165,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  #ifdef CONFIG_X86_LOCAL_APIC  	sum += irq_stats(cpu)->apic_timer_irqs;  #endif +	if (generic_interrupt_extension) +		sum += irq_stats(cpu)->generic_irqs;  #ifdef CONFIG_SMP  	sum += irq_stats(cpu)->irq_resched_count;  	sum += irq_stats(cpu)->irq_call_count; @@ -192,4 +201,63 @@ u64 arch_irq_stat(void)  	return sum;  } + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); + +	/* high bit used in ret_from_ code  */ +	unsigned vector = ~regs->orig_ax; +	unsigned irq; + +	exit_idle(); +	irq_enter(); + +	irq = __get_cpu_var(vector_irq)[vector]; + +	if (!handle_irq(irq, regs)) { +#ifdef CONFIG_X86_64 +		if (!disable_apic) +			ack_APIC_irq(); +#endif + +		if (printk_ratelimit()) +			printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", +			       __func__, smp_processor_id(), vector, irq); +	} + +	irq_exit(); + +	set_irq_regs(old_regs); +	return 1; +} + +/* + * Handler for GENERIC_INTERRUPT_VECTOR. 
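smp_generic_interrupt() below acks the APIC, bumps the new generic_irqs counter and then calls generic_interrupt_extension if platform code has installed one. A minimal sketch of the intended consumer side (the handler and init function are hypothetical; only the generic_interrupt_extension pointer and the vector wiring come from this patch):

	/* Hypothetical built-in platform driver hooking the generic vector. */
	static void example_platform_event(void)
	{
		/* handle the platform-specific interrupt source */
	}

	static int __init example_platform_init(void)
	{
		generic_interrupt_extension = example_platform_event;
		return 0;
	}
	device_initcall(example_platform_init);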
+ */ +void smp_generic_interrupt(struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); + +	ack_APIC_irq(); + +	exit_idle(); + +	irq_enter(); + +	inc_irq_stat(generic_irqs); + +	if (generic_interrupt_extension) +		generic_interrupt_extension(); + +	irq_exit(); + +	set_irq_regs(old_regs); +} +  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 74b9ff7341e..3b09634a515 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@  #include <linux/cpu.h>  #include <linux/delay.h>  #include <linux/uaccess.h> +#include <linux/percpu.h>  #include <asm/apic.h> @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }  union irq_ctx {  	struct thread_info      tinfo;  	u32                     stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);  static void call_on_stack(void *func, void *stack)  { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)  	u32 *isp, arg1, arg2;  	curctx = (union irq_ctx *) current_thread_info(); -	irqctx = hardirq_ctx[smp_processor_id()]; +	irqctx = __get_cpu_var(hardirq_ctx);  	/*  	 * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)  {  	union irq_ctx *irqctx; -	if (hardirq_ctx[cpu]) +	if (per_cpu(hardirq_ctx, cpu))  		return; -	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; +	irqctx = &per_cpu(hardirq_stack, cpu);  	irqctx->tinfo.task		= NULL;  	irqctx->tinfo.exec_domain	= NULL;  	irqctx->tinfo.cpu		= cpu;  	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;  	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0); -	hardirq_ctx[cpu] = irqctx; +	per_cpu(hardirq_ctx, cpu) = irqctx; -	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; +	irqctx = &per_cpu(softirq_stack, cpu);  	irqctx->tinfo.task		= NULL;  	irqctx->tinfo.exec_domain	= NULL;  	irqctx->tinfo.cpu		= cpu;  	irqctx->tinfo.preempt_count	= 0;  	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0); -	softirq_ctx[cpu] = irqctx; +	per_cpu(softirq_ctx, cpu) = irqctx;  	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", -	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); +	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));  }  void irq_ctx_exit(int cpu)  { -	hardirq_ctx[cpu] = NULL; +	per_cpu(hardirq_ctx, cpu) = NULL;  }  asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)  	if (local_softirq_pending()) {  		curctx = current_thread_info(); -		irqctx = softirq_ctx[smp_processor_id()]; +		irqctx = __get_cpu_var(softirq_ctx);  		irqctx->tinfo.task = curctx->task;  		irqctx->tinfo.previous_esp = current_stack_pointer; @@ -191,33 +192,16 @@ static inline int  execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }  #endif -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). 
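The irq_32.c changes above follow the per-cpu conversion pattern used throughout this series: fixed NR_CPUS-sized arrays become per-cpu variables, and indexed accesses become __get_cpu_var()/per_cpu(). A condensed sketch of the pattern with illustrative names:

	struct foo;
	/* was: static struct foo *foo_ctx[NR_CPUS] __read_mostly; */
	static DEFINE_PER_CPU(struct foo *, foo_ctx);

	static struct foo *this_cpu_foo(void)
	{
		return __get_cpu_var(foo_ctx);	/* was: foo_ctx[smp_processor_id()] */
	}

	static struct foo *cpu_foo(int cpu)
	{
		return per_cpu(foo_ctx, cpu);	/* was: foo_ctx[cpu] */
	}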
- */ -unsigned int do_IRQ(struct pt_regs *regs) +bool handle_irq(unsigned irq, struct pt_regs *regs)  { -	struct pt_regs *old_regs; -	/* high bit used in ret_from_ code */ -	int overflow; -	unsigned vector = ~regs->orig_ax;  	struct irq_desc *desc; -	unsigned irq; - - -	old_regs = set_irq_regs(regs); -	irq_enter(); -	irq = __get_cpu_var(vector_irq)[vector]; +	int overflow;  	overflow = check_stack_overflow();  	desc = irq_to_desc(irq); -	if (unlikely(!desc)) { -		printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", -					__func__, irq, vector, smp_processor_id()); -		BUG(); -	} +	if (unlikely(!desc)) +		return false;  	if (!execute_on_irq_stack(overflow, desc, irq)) {  		if (unlikely(overflow)) @@ -225,13 +209,10 @@ unsigned int do_IRQ(struct pt_regs *regs)  		desc->handle_irq(irq, desc);  	} -	irq_exit(); -	set_irq_regs(old_regs); -	return 1; +	return true;  }  #ifdef CONFIG_HOTPLUG_CPU -#include <mach_apic.h>  /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */  void fixup_irqs(void) @@ -248,7 +229,7 @@ void fixup_irqs(void)  		if (irq == 2)  			continue; -		affinity = &desc->affinity; +		affinity = desc->affinity;  		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {  			printk("Breaking affinity for irq %i\n", irq);  			affinity = cpu_all_mask; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 63c88e6ec02..977d8b43a0d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,6 +18,13 @@  #include <linux/smp.h>  #include <asm/io_apic.h>  #include <asm/idle.h> +#include <asm/apic.h> + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs);  /*   * Probabilistic stack overflow check: @@ -41,42 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)  #endif  } -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). 
- */ -asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +bool handle_irq(unsigned irq, struct pt_regs *regs)  { -	struct pt_regs *old_regs = set_irq_regs(regs);  	struct irq_desc *desc; -	/* high bit used in ret_from_ code  */ -	unsigned vector = ~regs->orig_ax; -	unsigned irq; - -	exit_idle(); -	irq_enter(); -	irq = __get_cpu_var(vector_irq)[vector]; -  	stack_overflow_check(regs);  	desc = irq_to_desc(irq); -	if (likely(desc)) -		generic_handle_irq_desc(irq, desc); -	else { -		if (!disable_apic) -			ack_APIC_irq(); - -		if (printk_ratelimit()) -			printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", -				__func__, smp_processor_id(), vector); -	} - -	irq_exit(); +	if (unlikely(!desc)) +		return false; -	set_irq_regs(old_regs); -	return 1; +	generic_handle_irq_desc(irq, desc); +	return true;  }  #ifdef CONFIG_HOTPLUG_CPU @@ -100,7 +83,7 @@ void fixup_irqs(void)  		/* interrupt's are disabled at this point */  		spin_lock(&desc->lock); -		affinity = &desc->affinity; +		affinity = desc->affinity;  		if (!irq_has_action(irq) ||  		    cpumask_equal(affinity, cpu_online_mask)) {  			spin_unlock(&desc->lock); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 10a09c2f182..bc132610544 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -18,7 +18,7 @@  #include <asm/pgtable.h>  #include <asm/desc.h>  #include <asm/apic.h> -#include <asm/arch_hooks.h> +#include <asm/setup.h>  #include <asm/i8259.h>  #include <asm/traps.h> @@ -78,6 +78,15 @@ void __init init_ISA_irqs(void)  	}  } +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { +	.handler = no_action, +	.mask = CPU_MASK_NONE, +	.name = "cascade", +}; +  DEFINE_PER_CPU(vector_irq_t, vector_irq) = {  	[0 ... 
IRQ0_VECTOR - 1] = -1,  	[IRQ0_VECTOR] = 0, @@ -118,8 +127,8 @@ void __init native_init_IRQ(void)  {  	int i; -	/* all the set up before the call gates are initialised */ -	pre_intr_init_hook(); +	/* Execute any quirks before the call gates are initialised: */ +	x86_quirk_pre_intr_init();  	/*  	 * Cover the whole vector space, no vector can escape @@ -140,8 +149,15 @@ void __init native_init_IRQ(void)  	 */  	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); -	/* IPI for invalidation */ -	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); +	/* IPIs for invalidation */ +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); +	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);  	/* IPI for generic function call */  	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); @@ -159,6 +175,9 @@ void __init native_init_IRQ(void)  	/* self generated IPI for local APIC timer */  	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); +	/* generic IPI for platform specific use */ +	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); +  	/* IPI vectors for APIC spurious and error interrupts */  	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);  	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); @@ -169,10 +188,14 @@ void __init native_init_IRQ(void)  	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);  #endif -	/* setup after call gates are initialised (usually add in -	 * the architecture specific gates) +	if (!acpi_ioapic) +		setup_irq(2, &irq2); + +	/* +	 * Call quirks after call gates are initialised (usually add in +	 * the architecture specific gates):  	 */ -	intr_init_hook(); +	x86_quirk_intr_init();  	/*  	 * External FPU? 
Set up irq13 if so, for diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f3..c7a49e0ffbf 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -147,6 +147,9 @@ static void __init apic_intr_init(void)  	/* self generated IPI for local APIC timer */  	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); +	/* generic IPI for platform specific use */ +	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); +  	/* IPI vectors for APIC spurious and error interrupts */  	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);  	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 10435a120d2..eedfaebe106 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,7 @@  #include <asm/apicdef.h>  #include <asm/system.h> -#include <mach_ipi.h> +#include <asm/apic.h>  /*   * Put the error code here just in case the user cares: @@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)   */  void kgdb_roundup_cpus(unsigned long flags)  { -	send_IPI_allbutself(APIC_DM_NMI); +	apic->send_IPI_allbutself(APIC_DM_NMI);  }  #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 652fce6d2cc..137f2e8132d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -19,7 +19,6 @@  #include <linux/clocksource.h>  #include <linux/kvm_para.h>  #include <asm/pvclock.h> -#include <asm/arch_hooks.h>  #include <asm/msr.h>  #include <asm/apic.h>  #include <linux/percpu.h> diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 37f420018a4..f5fc8c781a6 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -121,7 +121,7 @@ static void machine_kexec_page_table_set_one(  static void machine_kexec_prepare_page_tables(struct kimage *image)  {  	void *control_page; -	pmd_t *pmd = 0; +	pmd_t *pmd = NULL;  	control_page = page_address(image->control_code_page);  #ifdef CONFIG_X86_PAE diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c43caa3a91f..6993d51b7fd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,15 +18,6 @@  #include <asm/mmu_context.h>  #include <asm/io.h> -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u64 kexec_pgd[512] PAGE_ALIGNED; -static u64 kexec_pud0[512] PAGE_ALIGNED; -static u64 kexec_pmd0[512] PAGE_ALIGNED; -static u64 kexec_pte0[512] PAGE_ALIGNED; -static u64 kexec_pud1[512] PAGE_ALIGNED; -static u64 kexec_pmd1[512] PAGE_ALIGNED; -static u64 kexec_pte1[512] PAGE_ALIGNED; -  static void init_level2_page(pmd_t *level2p, unsigned long addr)  {  	unsigned long end_addr; @@ -107,12 +98,65 @@ out:  	return result;  } +static void free_transition_pgtable(struct kimage *image) +{ +	free_page((unsigned long)image->arch.pud); +	free_page((unsigned long)image->arch.pmd); +	free_page((unsigned long)image->arch.pte); +} + +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +{ +	pud_t *pud; +	pmd_t *pmd; +	pte_t *pte; +	unsigned long vaddr, paddr; +	int result = -ENOMEM; + +	vaddr = (unsigned long)relocate_kernel; +	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); +	pgd += pgd_index(vaddr); +	if (!pgd_present(*pgd)) { +		pud = (pud_t *)get_zeroed_page(GFP_KERNEL); +		if (!pud) +			goto err; +		image->arch.pud = pud; +		set_pgd(pgd, __pgd(__pa(pud) | 
_KERNPG_TABLE)); +	} +	pud = pud_offset(pgd, vaddr); +	if (!pud_present(*pud)) { +		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); +		if (!pmd) +			goto err; +		image->arch.pmd = pmd; +		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); +	} +	pmd = pmd_offset(pud, vaddr); +	if (!pmd_present(*pmd)) { +		pte = (pte_t *)get_zeroed_page(GFP_KERNEL); +		if (!pte) +			goto err; +		image->arch.pte = pte; +		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); +	} +	pte = pte_offset_kernel(pmd, vaddr); +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +	return 0; +err: +	free_transition_pgtable(image); +	return result; +} +  static int init_pgtable(struct kimage *image, unsigned long start_pgtable)  {  	pgd_t *level4p; +	int result;  	level4p = (pgd_t *)__va(start_pgtable); -	return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); +	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); +	if (result) +		return result; +	return init_transition_pgtable(image, level4p);  }  static void set_idt(void *newidt, u16 limit) @@ -174,7 +218,7 @@ int machine_kexec_prepare(struct kimage *image)  void machine_kexec_cleanup(struct kimage *image)  { -	return; +	free_transition_pgtable(image);  }  /* @@ -195,22 +239,6 @@ void machine_kexec(struct kimage *image)  	memcpy(control_page, relocate_kernel, PAGE_SIZE);  	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); -	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; -	page_list[PA_PGD] = virt_to_phys(&kexec_pgd); -	page_list[VA_PGD] = (unsigned long)kexec_pgd; -	page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); -	page_list[VA_PUD_0] = (unsigned long)kexec_pud0; -	page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); -	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; -	page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); -	page_list[VA_PTE_0] = (unsigned long)kexec_pte0; -	page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); -	page_list[VA_PUD_1] = (unsigned long)kexec_pud1; -	page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); -	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; -	page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); -	page_list[VA_PTE_1] = (unsigned long)kexec_pte1; -  	page_list[PA_TABLE_PAGE] =  	  (unsigned long)__pa(page_address(image->control_code_page)); diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 2dc183758be..845d80ce1ef 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -51,7 +51,6 @@  #include <linux/ioport.h>  #include <asm/uaccess.h>  #include <linux/init.h> -#include <asm/arch_hooks.h>  static unsigned char which_scsi; @@ -474,6 +473,4 @@ void __kprobes mca_handle_nmi(void)  	 * adapter was responsible for the error.  	 */  	bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); - -	mca_nmi_hook(); -} /* mca_handle_nmi */ +} diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index b7f4c929e61..5e9f4fc5138 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -87,9 +87,9 @@  #include <linux/cpu.h>  #include <linux/firmware.h>  #include <linux/platform_device.h> +#include <linux/uaccess.h>  #include <asm/msr.h> -#include <asm/uaccess.h>  #include <asm/processor.h>  #include <asm/microcode.h> @@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)  	return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 
0 : 1;  } -static inline int  +static inline int  update_match_revision(struct microcode_header_intel *mc_header,	int rev)  {  	return (mc_header->rev <= rev) ? 0 : 1; @@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)  		return ret;  	} -	ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, -			&get_ucode_fw); +	ret = generic_load_microcode(cpu, (void *)firmware->data, +				     firmware->size, &get_ucode_fw);  	release_firmware(firmware); @@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)  	/* We should bind the task to the CPU */  	BUG_ON(cpu != raw_smp_processor_id()); -	return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); +	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);  }  static void microcode_fini_cpu(int cpu) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c index 3db0a5442eb..0edd819050e 100644 --- a/arch/x86/kernel/module_32.c +++ b/arch/x86/kernel/module_32.c @@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)  {  	vfree(module_region);  	/* FIXME: If module_region == mod->init_region, trim exception -           table entries. */ +	   table entries. */  }  /* We don't need anything special. */ @@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,  		*para = NULL;  	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; -	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {  +	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {  		if (!strcmp(".text", secstrings + s->sh_name))  			text = s;  		if (!strcmp(".altinstructions", secstrings + s->sh_name))  			alt = s;  		if (!strcmp(".smp_locks", secstrings + s->sh_name)) -			locks= s; +			locks = s;  		if (!strcmp(".parainstructions", secstrings + s->sh_name))  			para = s;  	} diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index 6ba87830d4b..c23880b90b5 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -30,14 +30,14 @@  #include <asm/page.h>  #include <asm/pgtable.h> -#define DEBUGP(fmt...)  +#define DEBUGP(fmt...)  #ifndef CONFIG_UML  void module_free(struct module *mod, void *module_region)  {  	vfree(module_region);  	/* FIXME: If module_region == mod->init_region, trim exception -           table entries. */ +	   table entries. 
*/  }  void *module_alloc(unsigned long size) @@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  	Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;  	Elf64_Sym *sym;  	void *loc; -	u64 val;  +	u64 val;  	DEBUGP("Applying relocate section %u to %u\n", relsec,  	       sechdrs[relsec].sh_info); @@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  		sym = (Elf64_Sym *)sechdrs[symindex].sh_addr  			+ ELF64_R_SYM(rel[i].r_info); -	        DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", -		       (int)ELF64_R_TYPE(rel[i].r_info),  -		       sym->st_value, rel[i].r_addend, (u64)loc); +		DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", +			(int)ELF64_R_TYPE(rel[i].r_info), +			sym->st_value, rel[i].r_addend, (u64)loc); -		val = sym->st_value + rel[i].r_addend;  +		val = sym->st_value + rel[i].r_addend;  		switch (ELF64_R_TYPE(rel[i].r_info)) {  		case R_X86_64_NONE: @@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  			if ((s64)val != *(s32 *)loc)  				goto overflow;  			break; -		case R_X86_64_PC32:  +		case R_X86_64_PC32:  			val -= (u64)loc;  			*(u32 *)loc = val;  #if 0  			if ((s64)val != *(s32 *)loc) -				goto overflow;  +				goto overflow;  #endif  			break;  		default: -			printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", +			printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",  			       me->name, ELF64_R_TYPE(rel[i].r_info));  			return -ENOEXEC;  		} @@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  	return 0;  overflow: -	printk(KERN_ERR "overflow in relocation type %d val %Lx\n",  +	printk(KERN_ERR "overflow in relocation type %d val %Lx\n",  	       (int)ELF64_R_TYPE(rel[i].r_info), val);  	printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",  	       me->name); @@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,  		   unsigned int relsec,  		   struct module *me)  { -	printk("non add relocation not supported\n"); +	printk(KERN_ERR "non add relocation not supported\n");  	return -ENOSYS; -}  +}  int module_finalize(const Elf_Ehdr *hdr, -                    const Elf_Shdr *sechdrs, -                    struct module *me) +		    const Elf_Shdr *sechdrs, +		    struct module *me)  {  	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,  		*para = NULL; @@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,  		if (!strcmp(".altinstructions", secstrings + s->sh_name))  			alt = s;  		if (!strcmp(".smp_locks", secstrings + s->sh_name)) -			locks= s; +			locks = s;  		if (!strcmp(".parainstructions", secstrings + s->sh_name))  			para = s;  	} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a649a4ccad4..e8192401da4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -3,7 +3,7 @@   *	compliant MP-table parsing routines.   *   *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> - *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + *	(c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>   *      (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>   */ @@ -29,12 +29,7 @@  #include <asm/setup.h>  #include <asm/smp.h> -#include <mach_apic.h> -#ifdef CONFIG_X86_32 -#include <mach_apicdef.h> -#include <mach_mpparse.h> -#endif - +#include <asm/apic.h>  /*   * Checksum an MP configuration block.   
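The checksum rule referenced by this comment, and relied on by smp_scan_config() and update_mp_table() below, is that every byte of an MP structure, checksum field included, sums to zero modulo 256. A minimal sketch:

	/* An MP floating pointer / config table block is valid when its
	 * bytes sum to 0 mod 256. */
	static int mp_block_valid(const unsigned char *p, int len)
	{
		unsigned char sum = 0;

		while (len--)
			sum += *p++;
		return sum == 0;
	}

update_mp_table() regenerates the field the same way: zero it, then store the negated sum over the whole structure.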
*/ @@ -144,11 +139,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)  	if (bad_ioapic(m->apicaddr))  		return; -	mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; -	mp_ioapics[nr_ioapics].mp_apicid = m->apicid; -	mp_ioapics[nr_ioapics].mp_type = m->type; -	mp_ioapics[nr_ioapics].mp_apicver = m->apicver; -	mp_ioapics[nr_ioapics].mp_flags = m->flags; +	mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; +	mp_ioapics[nr_ioapics].apicid = m->apicid; +	mp_ioapics[nr_ioapics].type = m->type; +	mp_ioapics[nr_ioapics].apicver = m->apicver; +	mp_ioapics[nr_ioapics].flags = m->flags;  	nr_ioapics++;  } @@ -160,55 +155,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)  		m->srcbusirq, m->dstapic, m->dstirq);  } -static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)  {  	apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"  		" IRQ %02x, APIC ID %x, APIC INT %02x\n", -		mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, -		(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, -		mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); +		mp_irq->irqtype, mp_irq->irqflag & 3, +		(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, +		mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);  }  static void __init assign_to_mp_irq(struct mpc_intsrc *m, -				    struct mp_config_intsrc *mp_irq) +				    struct mpc_intsrc *mp_irq)  { -	mp_irq->mp_dstapic = m->dstapic; -	mp_irq->mp_type = m->type; -	mp_irq->mp_irqtype = m->irqtype; -	mp_irq->mp_irqflag = m->irqflag; -	mp_irq->mp_srcbus = m->srcbus; -	mp_irq->mp_srcbusirq = m->srcbusirq; -	mp_irq->mp_dstirq = m->dstirq; +	mp_irq->dstapic = m->dstapic; +	mp_irq->type = m->type; +	mp_irq->irqtype = m->irqtype; +	mp_irq->irqflag = m->irqflag; +	mp_irq->srcbus = m->srcbus; +	mp_irq->srcbusirq = m->srcbusirq; +	mp_irq->dstirq = m->dstirq;  } -static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, +static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,  					struct mpc_intsrc *m)  { -	m->dstapic = mp_irq->mp_dstapic; -	m->type = mp_irq->mp_type; -	m->irqtype = mp_irq->mp_irqtype; -	m->irqflag = mp_irq->mp_irqflag; -	m->srcbus = mp_irq->mp_srcbus; -	m->srcbusirq = mp_irq->mp_srcbusirq; -	m->dstirq = mp_irq->mp_dstirq; +	m->dstapic = mp_irq->dstapic; +	m->type = mp_irq->type; +	m->irqtype = mp_irq->irqtype; +	m->irqflag = mp_irq->irqflag; +	m->srcbus = mp_irq->srcbus; +	m->srcbusirq = mp_irq->srcbusirq; +	m->dstirq = mp_irq->dstirq;  } -static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, +static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,  					struct mpc_intsrc *m)  { -	if (mp_irq->mp_dstapic != m->dstapic) +	if (mp_irq->dstapic != m->dstapic)  		return 1; -	if (mp_irq->mp_type != m->type) +	if (mp_irq->type != m->type)  		return 2; -	if (mp_irq->mp_irqtype != m->irqtype) +	if (mp_irq->irqtype != m->irqtype)  		return 3; -	if (mp_irq->mp_irqflag != m->irqflag) +	if (mp_irq->irqflag != m->irqflag)  		return 4; -	if (mp_irq->mp_srcbus != m->srcbus) +	if (mp_irq->srcbus != m->srcbus)  		return 5; -	if (mp_irq->mp_srcbusirq != m->srcbusirq) +	if (mp_irq->srcbusirq != m->srcbusirq)  		return 6; -	if (mp_irq->mp_dstirq != m->dstirq) +	if (mp_irq->dstirq != m->dstirq)  		return 7;  	return 0; @@ -292,16 +287,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)  		return 0;  #ifdef CONFIG_X86_32 -	/* -	 * need to make sure summit and es7000's mps_oem_check is safe to be -	 * called early via 
genericarch 's mps_oem_check -	 */ -	if (early) { -#ifdef CONFIG_X86_NUMAQ -		numaq_mps_oem_check(mpc, oem, str); -#endif -	} else -		mps_oem_check(mpc, oem, str); +	generic_mps_oem_check(mpc, oem, str);  #endif  	/* save the local APIC address, it might be non-default */  	if (!acpi_lapic) @@ -386,13 +372,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)  			(*x86_quirks->mpc_record)++;  	} -#ifdef CONFIG_X86_GENERICARCH -       generic_bigsmp_probe(); +#ifdef CONFIG_X86_BIGSMP +	generic_bigsmp_probe();  #endif -#ifdef CONFIG_X86_32 -	setup_apic_routing(); -#endif +	if (apic->setup_apic_routing) +		apic->setup_apic_routing(); +  	if (!num_processors)  		printk(KERN_ERR "MPTABLE: no processors registered!\n");  	return num_processors; @@ -417,7 +403,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)  	intsrc.type = MP_INTSRC;  	intsrc.irqflag = 0;	/* conforming */  	intsrc.srcbus = 0; -	intsrc.dstapic = mp_ioapics[0].mp_apicid; +	intsrc.dstapic = mp_ioapics[0].apicid;  	intsrc.irqtype = mp_INT; @@ -570,14 +556,27 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)  	}  } -static struct intel_mp_floating *mpf_found; +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ +	struct mpc_table *mpc; +	unsigned long size; + +	mpc = early_ioremap(physptr, PAGE_SIZE); +	size = mpc->length; +	early_iounmap(mpc, PAGE_SIZE); +	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size); + +	return size; +}  /*   * Scan the memory blocks for an SMP configuration block.   */  static void __init __get_smp_config(unsigned int early)  { -	struct intel_mp_floating *mpf = mpf_found; +	struct mpf_intel *mpf = mpf_found;  	if (!mpf)  		return; @@ -598,9 +597,9 @@ static void __init __get_smp_config(unsigned int early)  	}  	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", -	       mpf->mpf_specification); +	       mpf->specification);  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -	if (mpf->mpf_feature2 & (1 << 7)) { +	if (mpf->feature2 & (1 << 7)) {  		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");  		pic_mode = 1;  	} else { @@ -611,7 +610,7 @@ static void __init __get_smp_config(unsigned int early)  	/*  	 * Now see if we need to read further.  	 */ -	if (mpf->mpf_feature1 != 0) { +	if (mpf->feature1 != 0) {  		if (early) {  			/*  			 * local APIC has default address @@ -621,16 +620,20 @@ static void __init __get_smp_config(unsigned int early)  		}  		printk(KERN_INFO "Default MP configuration #%d\n", -		       mpf->mpf_feature1); -		construct_default_ISA_mptable(mpf->mpf_feature1); +		       mpf->feature1); +		construct_default_ISA_mptable(mpf->feature1); -	} else if (mpf->mpf_physptr) { +	} else if (mpf->physptr) { +		struct mpc_table *mpc; +		unsigned long size; +		size = get_mpc_size(mpf->physptr); +		mpc = early_ioremap(mpf->physptr, size);  		/*  		 * Read the physical hardware table.  Anything here will  		 * override the defaults.  		 */ -		if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { +		if (!smp_read_mpc(mpc, early)) {  #ifdef CONFIG_X86_LOCAL_APIC  			smp_found_config = 0;  #endif @@ -638,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)  			       "BIOS bug, MP table errors detected!...\n");  			printk(KERN_ERR "... disabling SMP support. 
"  			       "(tell your hw vendor)\n"); +			early_iounmap(mpc, size);  			return;  		} +		early_iounmap(mpc, size);  		if (early)  			return; @@ -688,33 +693,33 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,  				  unsigned reserve)  {  	unsigned int *bp = phys_to_virt(base); -	struct intel_mp_floating *mpf; +	struct mpf_intel *mpf;  	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",  			bp, length);  	BUILD_BUG_ON(sizeof(*mpf) != 16);  	while (length > 0) { -		mpf = (struct intel_mp_floating *)bp; +		mpf = (struct mpf_intel *)bp;  		if ((*bp == SMP_MAGIC_IDENT) && -		    (mpf->mpf_length == 1) && +		    (mpf->length == 1) &&  		    !mpf_checksum((unsigned char *)bp, 16) && -		    ((mpf->mpf_specification == 1) -		     || (mpf->mpf_specification == 4))) { +		    ((mpf->specification == 1) +		     || (mpf->specification == 4))) {  #ifdef CONFIG_X86_LOCAL_APIC  			smp_found_config = 1;  #endif  			mpf_found = mpf; -			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", -			       mpf, virt_to_phys(mpf)); +			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", +			       mpf, (u64)virt_to_phys(mpf));  			if (!reserve)  				return 1; -			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, +			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),  					BOOTMEM_DEFAULT); -			if (mpf->mpf_physptr) { -				unsigned long size = PAGE_SIZE; +			if (mpf->physptr) { +				unsigned long size = get_mpc_size(mpf->physptr);  #ifdef CONFIG_X86_32  				/*  				 * We cannot access to MPC table to compute @@ -722,15 +727,24 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,  				 * the bottom is mapped now.  				 * PC-9800's MPC table places on the very last  				 * of physical memory; so that simply reserving -				 * PAGE_SIZE from mpg->mpf_physptr yields BUG() +				 * PAGE_SIZE from mpf->physptr yields BUG()  				 * in reserve_bootmem. 
+				 * also need to make sure physptr is below than +				 * max_low_pfn +				 * we don't need reserve the area above max_low_pfn  				 */  				unsigned long end = max_low_pfn * PAGE_SIZE; -				if (mpf->mpf_physptr + size > end) -					size = end - mpf->mpf_physptr; -#endif -				reserve_bootmem_generic(mpf->mpf_physptr, size, + +				if (mpf->physptr < end) { +					if (mpf->physptr + size > end) +						size = end - mpf->physptr; +					reserve_bootmem_generic(mpf->physptr, size, +							BOOTMEM_DEFAULT); +				} +#else +				reserve_bootmem_generic(mpf->physptr, size,  						BOOTMEM_DEFAULT); +#endif  			}  			return 1; @@ -809,15 +823,15 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)  	/* not legacy */  	for (i = 0; i < mp_irq_entries; i++) { -		if (mp_irqs[i].mp_irqtype != mp_INT) +		if (mp_irqs[i].irqtype != mp_INT)  			continue; -		if (mp_irqs[i].mp_irqflag != 0x0f) +		if (mp_irqs[i].irqflag != 0x0f)  			continue; -		if (mp_irqs[i].mp_srcbus != m->srcbus) +		if (mp_irqs[i].srcbus != m->srcbus)  			continue; -		if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) +		if (mp_irqs[i].srcbusirq != m->srcbusirq)  			continue;  		if (irq_used[i]) {  			/* already claimed */ @@ -922,10 +936,10 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,  		if (irq_used[i])  			continue; -		if (mp_irqs[i].mp_irqtype != mp_INT) +		if (mp_irqs[i].irqtype != mp_INT)  			continue; -		if (mp_irqs[i].mp_irqflag != 0x0f) +		if (mp_irqs[i].irqflag != 0x0f)  			continue;  		if (nr_m_spare > 0) { @@ -1001,7 +1015,7 @@ static int __init update_mp_table(void)  {  	char str[16];  	char oem[10]; -	struct intel_mp_floating *mpf; +	struct mpf_intel *mpf;  	struct mpc_table *mpc, *mpc_new;  	if (!enable_update_mptable) @@ -1014,19 +1028,19 @@ static int __init update_mp_table(void)  	/*  	 * Now see if we need to go further.  	 
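For reference, the mpf_intel structure whose accessors are being renamed throughout this file (mpf_physptr becomes physptr, and so on), sketched from the fields used here; the BUILD_BUG_ON() in smp_scan_config() pins it at 16 bytes:

	struct mpf_intel {			/* layout inferred from this file */
		char signature[4];		/* "_MP_" (SMP_MAGIC_IDENT) */
		unsigned int physptr;		/* phys address of the MP config table */
		unsigned char length;		/* 1: length in 16-byte paragraphs */
		unsigned char specification;	/* MP spec revision: 1 or 4 */
		unsigned char checksum;		/* makes the 16 bytes sum to 0 */
		unsigned char feature1;		/* default config type; 0 = MP table present */
		unsigned char feature2;		/* bit 7: IMCR present / PIC mode */
		unsigned char feature3;
		unsigned char feature4;
		unsigned char feature5;
	};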
*/ -	if (mpf->mpf_feature1 != 0) +	if (mpf->feature1 != 0)  		return 0; -	if (!mpf->mpf_physptr) +	if (!mpf->physptr)  		return 0; -	mpc = phys_to_virt(mpf->mpf_physptr); +	mpc = phys_to_virt(mpf->physptr);  	if (!smp_check_mpc(mpc, oem, str))  		return 0; -	printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); -	printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); +	printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf)); +	printk(KERN_INFO "physptr: %x\n", mpf->physptr);  	if (mpc_new_phys && mpc->length > mpc_new_length) {  		mpc_new_phys = 0; @@ -1047,23 +1061,23 @@ static int __init update_mp_table(void)  		}  		printk(KERN_INFO "use in-positon replacing\n");  	} else { -		mpf->mpf_physptr = mpc_new_phys; +		mpf->physptr = mpc_new_phys;  		mpc_new = phys_to_virt(mpc_new_phys);  		memcpy(mpc_new, mpc, mpc->length);  		mpc = mpc_new;  		/* check if we can modify that */ -		if (mpc_new_phys - mpf->mpf_physptr) { -			struct intel_mp_floating *mpf_new; +		if (mpc_new_phys - mpf->physptr) { +			struct mpf_intel *mpf_new;  			/* steal 16 bytes from [0, 1k) */  			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);  			mpf_new = phys_to_virt(0x400 - 16);  			memcpy(mpf_new, mpf, 16);  			mpf = mpf_new; -			mpf->mpf_physptr = mpc_new_phys; +			mpf->physptr = mpc_new_phys;  		} -		mpf->mpf_checksum = 0; -		mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); -		printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); +		mpf->checksum = 0; +		mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); +		printk(KERN_INFO "physptr new: %x\n", mpf->physptr);  	}  	/* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 726266695b2..3cf3413ec62 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -35,10 +35,10 @@  #include <linux/device.h>  #include <linux/cpu.h>  #include <linux/notifier.h> +#include <linux/uaccess.h>  #include <asm/processor.h>  #include <asm/msr.h> -#include <asm/uaccess.h>  #include <asm/system.h>  static struct class *msr_class; diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c deleted file mode 100644 index f2191d4f271..00000000000 --- a/arch/x86/kernel/numaq_32.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Written by: Patricia Gaughen, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved.           - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT.  See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to <gone@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/bootmem.h> -#include <linux/mmzone.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <asm/numaq.h> -#include <asm/topology.h> -#include <asm/processor.h> -#include <asm/genapic.h> -#include <asm/e820.h> -#include <asm/setup.h> - -#define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - -/* - * Function: smp_dump_qct() - * - * Description: gets memory layout from the quad config table.  This - * function also updates node_online_map with the nodes (quads) present. - */ -static void __init smp_dump_qct(void) -{ -	int node; -	struct eachquadmem *eq; -	struct sys_cfg_data *scd = -		(struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); - -	nodes_clear(node_online_map); -	for_each_node(node) { -		if (scd->quads_present31_0 & (1 << node)) { -			node_set_online(node); -			eq = &scd->eq[node]; -			/* Convert to pages */ -			node_start_pfn[node] = MB_TO_PAGES( -				eq->hi_shrd_mem_start - eq->priv_mem_size); -			node_end_pfn[node] = MB_TO_PAGES( -				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - -			e820_register_active_regions(node, node_start_pfn[node], -							node_end_pfn[node]); -			memory_present(node, -				node_start_pfn[node], node_end_pfn[node]); -			node_remap_size[node] = node_memmap_size_bytes(node, -							node_start_pfn[node], -							node_end_pfn[node]); -		} -	} -} - - -void __cpuinit numaq_tsc_disable(void) -{ -	if (!found_numaq) -		return; - -	if (num_online_nodes() > 1) { -		printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); -		setup_clear_cpu_cap(X86_FEATURE_TSC); -	} -} - -static int __init numaq_pre_time_init(void) -{ -	numaq_tsc_disable(); -	return 0; -} - -int found_numaq; -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_config_translation { -	unsigned char mpc_type; -	unsigned char trans_len; -	unsigned char trans_type; -	unsigned char trans_quad; -	unsigned char trans_global; -	unsigned char trans_local; -	unsigned short trans_reserved; -}; - -/* x86_quirks member */ -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] -    __cpuinitdata; - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ -	return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); -} - -/* x86_quirks member */ -static int mpc_apic_id(struct mpc_cpu *m) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int logical_apicid = generate_logical_apicid(quad, m->apicid); - -	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", -	       m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, -	       (m->cpufeature & CPU_MODEL_MASK) >> 4, -	       m->apicver, quad, logical_apicid); -	return logical_apicid; -} - -int mp_bus_id_to_node[MAX_MP_BUSSES]; - -int mp_bus_id_to_local[MAX_MP_BUSSES]; - -/* x86_quirks member */ -static void mpc_oem_bus_info(struct mpc_bus *m, char *name) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int local = translation_table[mpc_record]->trans_local; - -	mp_bus_id_to_node[m->busid] = quad; -	mp_bus_id_to_local[m->busid] = local; -	printk(KERN_INFO "Bus #%d is %s (node %d)\n", -	       m->busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; - -/* x86_quirks member */ -static void mpc_oem_pci_bus(struct mpc_bus *m) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int local = translation_table[mpc_record]->trans_local; - -	quad_local_to_mp_bus_id[quad][local] = m->busid; -} - -static void __init MP_translation_info(struct mpc_config_translation *m) -{ -	printk(KERN_INFO -	       "Translation: record %d, type %d, quad %d, global %d, local %d\n", -	       mpc_record, m->trans_type, m->trans_quad, m->trans_global, -	       m->trans_local); - -	if (mpc_record >= MAX_MPC_ENTRY) -		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); -	else -		translation_table[mpc_record] = m;	/* stash this for later */ -	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) -		node_set_online(m->trans_quad); -} - -static int __init mpf_checksum(unsigned char *mp, int len) -{ -	int sum = 0; - -	while (len--) -		sum += *mp++; - -	return sum & 0xFF; -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable, -				    unsigned short oemsize) -{ -	int count = sizeof(*oemtable);	/* the header size */ -	unsigned char *oemptr = ((unsigned char *)oemtable) + count; - -	mpc_record = 0; -	printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", -	       oemtable); -	if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { -		printk(KERN_WARNING -		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", -		       oemtable->signature[0], oemtable->signature[1], -		       oemtable->signature[2], oemtable->signature[3]); -		return; -	} -	if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { -		printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); -		return; -	} -	while (count < oemtable->length) { -		switch (*oemptr) { -		case MP_TRANSLATION: -			{ -				struct mpc_config_translation *m = -				    (struct mpc_config_translation *)oemptr; -				MP_translation_info(m); -				oemptr += sizeof(*m); -				count += sizeof(*m); -				++mpc_record; -				break; -			} -		default: -			{ -				printk(KERN_WARNING -				       "Unrecognised OEM table entry type! 
- %d\n", -				       (int)*oemptr); -				return; -			} -		} -	} -} - -static int __init numaq_setup_ioapic_ids(void) -{ -	/* so can skip it */ -	return 1; -} - -static int __init numaq_update_genapic(void) -{ -	genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; - -	return 0; -} - -static struct x86_quirks numaq_x86_quirks __initdata = { -	.arch_pre_time_init	= numaq_pre_time_init, -	.arch_time_init		= NULL, -	.arch_pre_intr_init	= NULL, -	.arch_memory_setup	= NULL, -	.arch_intr_init		= NULL, -	.arch_trap_init		= NULL, -	.mach_get_smp_config	= NULL, -	.mach_find_smp_config	= NULL, -	.mpc_record		= &mpc_record, -	.mpc_apic_id		= mpc_apic_id, -	.mpc_oem_bus_info	= mpc_oem_bus_info, -	.mpc_oem_pci_bus	= mpc_oem_pci_bus, -	.smp_read_mpc_oem	= smp_read_mpc_oem, -	.setup_ioapic_ids	= numaq_setup_ioapic_ids, -	.update_genapic		= numaq_update_genapic, -}; - -void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ -	if (strncmp(oem, "IBM NUMA", 8)) -		printk("Warning!  Not a NUMA-Q system!\n"); -	else -		found_numaq = 1; -} - -static __init void early_check_numaq(void) -{ -	/* -	 * Find possible boot-time SMP configuration: -	 */ -	early_find_smp_config(); -	/* -	 * get boot-time SMP configuration: -	 */ -	if (smp_found_config) -		early_get_smp_config(); - -	if (found_numaq) -		x86_quirks = &numaq_x86_quirks; -} - -int __init get_memcfg_numaq(void) -{ -	early_check_numaq(); -	if (!found_numaq) -		return 0; -	smp_dump_qct(); -	return 1; -} diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 95777b0faa7..3a7c5a44082 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -26,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {  };  EXPORT_SYMBOL(pv_lock_ops); -void __init paravirt_use_bytelocks(void) -{ -#ifdef CONFIG_SMP -	pv_lock_ops.spin_is_locked = __byte_spin_is_locked; -	pv_lock_ops.spin_is_contended = __byte_spin_is_contended; -	pv_lock_ops.spin_lock = __byte_spin_lock; -	pv_lock_ops.spin_trylock = __byte_spin_trylock; -	pv_lock_ops.spin_unlock = __byte_spin_unlock; -#endif -} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c6520a4e85d..63dd358d8ee 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -28,7 +28,6 @@  #include <asm/paravirt.h>  #include <asm/desc.h>  #include <asm/setup.h> -#include <asm/arch_hooks.h>  #include <asm/pgtable.h>  #include <asm/time.h>  #include <asm/pgalloc.h> @@ -44,6 +43,17 @@ void _paravirt_nop(void)  {  } +/* identity function, which can be inlined */ +u32 _paravirt_ident_32(u32 x) +{ +	return x; +} + +u64 _paravirt_ident_64(u64 x) +{ +	return x; +} +  static void __init default_banner(void)  {  	printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -138,9 +148,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,  	if (opfunc == NULL)  		/* If there's no function, patch it with a ud2a (BUG) */  		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); -	else if (opfunc == paravirt_nop) +	else if (opfunc == _paravirt_nop)  		/* If the operation is a nop, then nop the callsite */  		ret = paravirt_patch_nop(); + +	/* identity functions just return their single argument */ +	else if (opfunc == _paravirt_ident_32) +		ret = paravirt_patch_ident_32(insnbuf, len); +	else if (opfunc == _paravirt_ident_64) +		ret = paravirt_patch_ident_64(insnbuf, len); +  	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||  		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||  		 type == 
PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || @@ -318,10 +335,10 @@ struct pv_time_ops pv_time_ops = {  struct pv_irq_ops pv_irq_ops = {  	.init_IRQ = native_init_IRQ, -	.save_fl = native_save_fl, -	.restore_fl = native_restore_fl, -	.irq_disable = native_irq_disable, -	.irq_enable = native_irq_enable, +	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), +	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), +	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), +	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),  	.safe_halt = native_safe_halt,  	.halt = native_halt,  #ifdef CONFIG_X86_64 @@ -399,6 +416,14 @@ struct pv_apic_ops pv_apic_ops = {  #endif  }; +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) +/* 32-bit pagetable entries */ +#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_32) +#else +/* 64-bit pagetable entries */ +#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_64) +#endif +  struct pv_mmu_ops pv_mmu_ops = {  #ifndef CONFIG_X86_64  	.pagetable_setup_start = native_pagetable_setup_start, @@ -450,22 +475,23 @@ struct pv_mmu_ops pv_mmu_ops = {  	.pmd_clear = native_pmd_clear,  #endif  	.set_pud = native_set_pud, -	.pmd_val = native_pmd_val, -	.make_pmd = native_make_pmd, + +	.pmd_val = PTE_IDENT, +	.make_pmd = PTE_IDENT,  #if PAGETABLE_LEVELS == 4 -	.pud_val = native_pud_val, -	.make_pud = native_make_pud, +	.pud_val = PTE_IDENT, +	.make_pud = PTE_IDENT, +  	.set_pgd = native_set_pgd,  #endif  #endif /* PAGETABLE_LEVELS >= 3 */ -	.pte_val = native_pte_val, -	.pte_flags = native_pte_flags, -	.pgd_val = native_pgd_val, +	.pte_val = PTE_IDENT, +	.pgd_val = PTE_IDENT, -	.make_pte = native_make_pte, -	.make_pgd = native_make_pgd, +	.make_pte = PTE_IDENT, +	.make_pgd = PTE_IDENT,  	.dup_mmap = paravirt_nop,  	.exit_mmap = paravirt_nop, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 9fe644f4861..d9f32e6d6ab 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");  DEF_NATIVE(pv_cpu_ops, clts, "clts");  DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ +	/* arg in %eax, return in %eax */ +	return 0; +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ +	/* arg in %edx:%eax, return in %edx:%eax */ +	return 0; +} +  unsigned native_patch(u8 type, u16 clobbers, void *ibuf,  		      unsigned long addr, unsigned len)  { diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 061d01df9ae..3f08f34f93e 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");  DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");  DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); +DEF_NATIVE(, mov32, "mov %edi, %eax"); +DEF_NATIVE(, mov64, "mov %rdi, %rax"); + +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ +	return paravirt_patch_insns(insnbuf, len, +				    start__mov32, end__mov32); +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ +	return paravirt_patch_insns(insnbuf, len, +				    start__mov64, end__mov64); +} +  unsigned native_patch(u8 type, u16 clobbers, void *ibuf,  		      unsigned long addr, unsigned len)  { diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c index 675a48c404a..071e7fea42e 100644 --- a/arch/x86/kernel/probe_roms_32.c +++ 
b/arch/x86/kernel/probe_roms_32.c @@ -18,7 +18,7 @@  #include <asm/setup.h>  #include <asm/sections.h>  #include <asm/io.h> -#include <setup_arch.h> +#include <asm/setup_arch.h>  static struct resource system_rom_resource = {  	.name	= "System ROM", diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6d12f7e37f8..6afa5232dbb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,8 +1,8 @@  #include <linux/errno.h>  #include <linux/kernel.h>  #include <linux/mm.h> -#include <asm/idle.h>  #include <linux/smp.h> +#include <linux/prctl.h>  #include <linux/slab.h>  #include <linux/sched.h>  #include <linux/module.h> @@ -11,6 +11,9 @@  #include <linux/ftrace.h>  #include <asm/system.h>  #include <asm/apic.h> +#include <asm/idle.h> +#include <asm/uaccess.h> +#include <asm/i387.h>  unsigned long idle_halt;  EXPORT_SYMBOL(idle_halt); @@ -56,6 +59,192 @@ void arch_task_cache_init(void)  }  /* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ +	struct task_struct *me = current; +	struct thread_struct *t = &me->thread; + +	if (me->thread.io_bitmap_ptr) { +		struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + +		kfree(t->io_bitmap_ptr); +		t->io_bitmap_ptr = NULL; +		clear_thread_flag(TIF_IO_BITMAP); +		/* +		 * Careful, clear this in the TSS too: +		 */ +		memset(tss->io_bitmap, 0xff, t->io_bitmap_max); +		t->io_bitmap_max = 0; +		put_cpu(); +	} + +	ds_exit_thread(current); +} + +void flush_thread(void) +{ +	struct task_struct *tsk = current; + +#ifdef CONFIG_X86_64 +	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { +		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); +		if (test_tsk_thread_flag(tsk, TIF_IA32)) { +			clear_tsk_thread_flag(tsk, TIF_IA32); +		} else { +			set_tsk_thread_flag(tsk, TIF_IA32); +			current_thread_info()->status |= TS_COMPAT; +		} +	} +#endif + +	clear_tsk_thread_flag(tsk, TIF_DEBUG); + +	tsk->thread.debugreg0 = 0; +	tsk->thread.debugreg1 = 0; +	tsk->thread.debugreg2 = 0; +	tsk->thread.debugreg3 = 0; +	tsk->thread.debugreg6 = 0; +	tsk->thread.debugreg7 = 0; +	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); +	/* +	 * Forget coprocessor state.. +	 */ +	tsk->fpu_counter = 0; +	clear_fpu(tsk); +	clear_used_math(); +} + +static void hard_disable_TSC(void) +{ +	write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ +	preempt_disable(); +	if (!test_and_set_thread_flag(TIF_NOTSC)) +		/* +		 * Must flip the CPU state synchronously with +		 * TIF_NOTSC in the current running context. +		 */ +		hard_disable_TSC(); +	preempt_enable(); +} + +static void hard_enable_TSC(void) +{ +	write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ +	preempt_disable(); +	if (test_and_clear_thread_flag(TIF_NOTSC)) +		/* +		 * Must flip the CPU state synchronously with +		 * TIF_NOTSC in the current running context. 
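The TIF_NOTSC handling above, together with the get_tsc_mode()/set_tsc_mode() helpers that follow, backs the PR_GET_TSC/PR_SET_TSC prctl interface (note the new <linux/prctl.h> include at the top of this file). A minimal userspace sketch of that interface:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		unsigned int mode = 0;

		prctl(PR_GET_TSC, &mode);		/* PR_TSC_ENABLE or PR_TSC_SIGSEGV */
		printf("TSC mode: %u\n", mode);

		prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	/* rdtsc in this task now raises SIGSEGV */
		return 0;
	}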
+		 */ +		hard_enable_TSC(); +	preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ +	unsigned int val; + +	if (test_thread_flag(TIF_NOTSC)) +		val = PR_TSC_SIGSEGV; +	else +		val = PR_TSC_ENABLE; + +	return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ +	if (val == PR_TSC_SIGSEGV) +		disable_TSC(); +	else if (val == PR_TSC_ENABLE) +		enable_TSC(); +	else +		return -EINVAL; + +	return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, +		      struct tss_struct *tss) +{ +	struct thread_struct *prev, *next; + +	prev = &prev_p->thread; +	next = &next_p->thread; + +	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || +	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) +		ds_switch_to(prev_p, next_p); +	else if (next->debugctlmsr != prev->debugctlmsr) +		update_debugctlmsr(next->debugctlmsr); + +	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { +		set_debugreg(next->debugreg0, 0); +		set_debugreg(next->debugreg1, 1); +		set_debugreg(next->debugreg2, 2); +		set_debugreg(next->debugreg3, 3); +		/* no 4 and 5 */ +		set_debugreg(next->debugreg6, 6); +		set_debugreg(next->debugreg7, 7); +	} + +	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ +	    test_tsk_thread_flag(next_p, TIF_NOTSC)) { +		/* prev and next are different */ +		if (test_tsk_thread_flag(next_p, TIF_NOTSC)) +			hard_disable_TSC(); +		else +			hard_enable_TSC(); +	} + +	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +		/* +		 * Copy the relevant range of the IO bitmap. +		 * Normally this is 128 bytes or less: +		 */ +		memcpy(tss->io_bitmap, next->io_bitmap_ptr, +		       max(prev->io_bitmap_max, next->io_bitmap_max)); +	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { +		/* +		 * Clear any possible leftover bits: +		 */ +		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); +	} +} + +int sys_fork(struct pt_regs *regs) +{ +	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ +	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, +		       NULL, NULL); +} + + +/*   * Idle related variables and functions   */  unsigned long boot_option_idle_override = 0; @@ -350,7 +539,7 @@ static void c1e_idle(void)  void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)  { -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP  	if (pm_idle == poll_idle && smp_num_siblings > 1) {  		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"  			" performance may degrade.\n"); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd4da2af08a..14014d766ca 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -11,6 +11,7 @@  #include <stdarg.h> +#include <linux/stackprotector.h>  #include <linux/cpu.h>  #include <linux/errno.h>  #include <linux/sched.h> @@ -66,9 +67,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");  DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;  EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); -  /*   * Return saved PC of a blocked thread.   
*/ @@ -94,6 +92,15 @@ void cpu_idle(void)  {  	int cpu = smp_processor_id(); +	/* +	 * If we're the non-boot CPU, nothing set the stack canary up +	 * for us.  CPU0 already has it initialized but no harm in +	 * doing it again.  This is a good place for updating it, as +	 * we wont ever return from this function (so the invalid +	 * canaries already on the stack wont ever trigger). +	 */ +	boot_init_stack_canary(); +  	current_thread_info()->status |= TS_POLLING;  	/* endless idle loop with no priority at all */ @@ -108,7 +115,6 @@ void cpu_idle(void)  				play_dead();  			local_irq_disable(); -			__get_cpu_var(irq_stat).idle_timestamp = jiffies;  			/* Don't trace irqs off for idle */  			stop_critical_timings();  			pm_idle(); @@ -132,7 +138,7 @@ void __show_regs(struct pt_regs *regs, int all)  	if (user_mode_vm(regs)) {  		sp = regs->sp;  		ss = regs->ss & 0xffff; -		savesegment(gs, gs); +		gs = get_user_gs(regs);  	} else {  		sp = (unsigned long) (&regs->sp);  		savesegment(ss, ss); @@ -213,6 +219,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)  	regs.ds = __USER_DS;  	regs.es = __USER_DS;  	regs.fs = __KERNEL_PERCPU; +	regs.gs = __KERNEL_STACK_CANARY;  	regs.orig_ax = -1;  	regs.ip = (unsigned long) kernel_thread_helper;  	regs.cs = __KERNEL_CS | get_kernel_rpl(); @@ -223,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)  }  EXPORT_SYMBOL(kernel_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -	/* The process may have allocated an io port bitmap... nuke it. */ -	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { -		struct task_struct *tsk = current; -		struct thread_struct *t = &tsk->thread; -		int cpu = get_cpu(); -		struct tss_struct *tss = &per_cpu(init_tss, cpu); - -		kfree(t->io_bitmap_ptr); -		t->io_bitmap_ptr = NULL; -		clear_thread_flag(TIF_IO_BITMAP); -		/* -		 * Careful, clear this in the TSS too: -		 */ -		memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); -		t->io_bitmap_max = 0; -		tss->io_bitmap_owner = NULL; -		tss->io_bitmap_max = 0; -		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; -		put_cpu(); -	} - -	ds_exit_thread(current); -} - -void flush_thread(void) -{ -	struct task_struct *tsk = current; - -	tsk->thread.debugreg0 = 0; -	tsk->thread.debugreg1 = 0; -	tsk->thread.debugreg2 = 0; -	tsk->thread.debugreg3 = 0; -	tsk->thread.debugreg6 = 0; -	tsk->thread.debugreg7 = 0; -	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -	clear_tsk_thread_flag(tsk, TIF_DEBUG); -	/* -	 * Forget coprocessor state.. 
-	 */ -	tsk->fpu_counter = 0; -	clear_fpu(tsk); -	clear_used_math(); -} -  void release_thread(struct task_struct *dead_task)  {  	BUG_ON(dead_task->mm); @@ -305,7 +263,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,  	p->thread.ip = (unsigned long) ret_from_fork; -	savesegment(gs, p->thread.gs); +	task_user_gs(p) = get_user_gs(regs);  	tsk = current;  	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -343,7 +301,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,  void  start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)  { -	__asm__("movl %0, %%gs" : : "r"(0)); +	set_user_gs(regs, 0);  	regs->fs		= 0;  	set_fs(USER_DS);  	regs->ds		= __USER_DS; @@ -359,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)  }  EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ -	write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ -	preempt_disable(); -	if (!test_and_set_thread_flag(TIF_NOTSC)) -		/* -		 * Must flip the CPU state synchronously with -		 * TIF_NOTSC in the current running context. -		 */ -		hard_disable_TSC(); -	preempt_enable(); -} - -static void hard_enable_TSC(void) -{ -	write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ -	preempt_disable(); -	if (test_and_clear_thread_flag(TIF_NOTSC)) -		/* -		 * Must flip the CPU state synchronously with -		 * TIF_NOTSC in the current running context. -		 */ -		hard_enable_TSC(); -	preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ -	unsigned int val; - -	if (test_thread_flag(TIF_NOTSC)) -		val = PR_TSC_SIGSEGV; -	else -		val = PR_TSC_ENABLE; - -	return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ -	if (val == PR_TSC_SIGSEGV) -		disable_TSC(); -	else if (val == PR_TSC_ENABLE) -		enable_TSC(); -	else -		return -EINVAL; - -	return 0; -} - -static noinline void -__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, -		 struct tss_struct *tss) -{ -	struct thread_struct *prev, *next; - -	prev = &prev_p->thread; -	next = &next_p->thread; - -	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || -	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) -		ds_switch_to(prev_p, next_p); -	else if (next->debugctlmsr != prev->debugctlmsr) -		update_debugctlmsr(next->debugctlmsr); - -	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { -		set_debugreg(next->debugreg0, 0); -		set_debugreg(next->debugreg1, 1); -		set_debugreg(next->debugreg2, 2); -		set_debugreg(next->debugreg3, 3); -		/* no 4 and 5 */ -		set_debugreg(next->debugreg6, 6); -		set_debugreg(next->debugreg7, 7); -	} - -	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ -	    test_tsk_thread_flag(next_p, TIF_NOTSC)) { -		/* prev and next are different */ -		if (test_tsk_thread_flag(next_p, TIF_NOTSC)) -			hard_disable_TSC(); -		else -			hard_enable_TSC(); -	} - -	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { -		/* -		 * Disable the bitmap via an invalid offset. We still cache -		 * the previous bitmap owner and the IO bitmap contents: -		 */ -		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; -		return; -	} - -	if (likely(next == tss->io_bitmap_owner)) { -		/* -		 * Previous owner of the bitmap (hence the bitmap content) -		 * matches the next task, we dont have to do anything but -		 * to set a valid offset in the TSS: -		 */ -		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; -		return; -	} -	/* -	 * Lazy TSS's I/O bitmap copy. 
We set an invalid offset here -	 * and we let the task to get a GPF in case an I/O instruction -	 * is performed.  The handler of the GPF will verify that the -	 * faulting task has a valid I/O bitmap and, it true, does the -	 * real copy and restart the instruction.  This will save us -	 * redundant copies when the currently switched task does not -	 * perform any I/O during its timeslice. -	 */ -	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -}  /*   *	switch_to(x,yn) should switch tasks from x to y. @@ -540,7 +377,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 * used %fs or %gs (it does not today), or if the kernel is  	 * running inside of a hypervisor layer.  	 */ -	savesegment(gs, prev->gs); +	lazy_save_gs(prev->gs);  	/*  	 * Load the per-thread Thread-Local Storage descriptor. @@ -586,64 +423,44 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 * Restore %gs if needed (which is common)  	 */  	if (prev->gs | next->gs) -		loadsegment(gs, next->gs); +		lazy_load_gs(next->gs); -	x86_write_percpu(current_task, next_p); +	percpu_write(current_task, next_p);  	return prev_p;  } -asmlinkage int sys_fork(struct pt_regs regs) -{ -	return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL); -} - -asmlinkage int sys_clone(struct pt_regs regs) +int sys_clone(struct pt_regs *regs)  {  	unsigned long clone_flags;  	unsigned long newsp;  	int __user *parent_tidptr, *child_tidptr; -	clone_flags = regs.bx; -	newsp = regs.cx; -	parent_tidptr = (int __user *)regs.dx; -	child_tidptr = (int __user *)regs.di; +	clone_flags = regs->bx; +	newsp = regs->cx; +	parent_tidptr = (int __user *)regs->dx; +	child_tidptr = (int __user *)regs->di;  	if (!newsp) -		newsp = regs.sp; -	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage int sys_vfork(struct pt_regs regs) -{ -	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL); +		newsp = regs->sp; +	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);  }  /*   * sys_execve() executes a new program.   */ -asmlinkage int sys_execve(struct pt_regs regs) +int sys_execve(struct pt_regs *regs)  {  	int error;  	char *filename; -	filename = getname((char __user *) regs.bx); +	filename = getname((char __user *) regs->bx);  	error = PTR_ERR(filename);  	if (IS_ERR(filename))  		goto out;  	error = do_execve(filename, -			(char __user * __user *) regs.cx, -			(char __user * __user *) regs.dx, -			&regs); +			(char __user * __user *) regs->cx, +			(char __user * __user *) regs->dx, +			regs);  	if (error == 0) {  		/* Make sure we don't return using sysenter.. 
*/  		set_thread_flag(TIF_IRET); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 85b4cb5c198..abb7e6a7f0c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -16,6 +16,7 @@  #include <stdarg.h> +#include <linux/stackprotector.h>  #include <linux/cpu.h>  #include <linux/errno.h>  #include <linux/sched.h> @@ -47,7 +48,6 @@  #include <asm/processor.h>  #include <asm/i387.h>  #include <asm/mmu_context.h> -#include <asm/pda.h>  #include <asm/prctl.h>  #include <asm/desc.h>  #include <asm/proto.h> @@ -58,6 +58,12 @@  asmlinkage extern void ret_from_fork(void); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(unsigned long, old_rsp); +static DEFINE_PER_CPU(unsigned char, is_idle); +  unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;  static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);  void enter_idle(void)  { -	write_pda(isidle, 1); +	percpu_write(is_idle, 1);  	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);  }  static void __exit_idle(void)  { -	if (test_and_clear_bit_pda(0, isidle) == 0) +	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)  		return;  	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);  } @@ -112,6 +118,16 @@ static inline void play_dead(void)  void cpu_idle(void)  {  	current_thread_info()->status |= TS_POLLING; + +	/* +	 * If we're the non-boot CPU, nothing set the stack canary up +	 * for us.  CPU0 already has it initialized but no harm in +	 * doing it again.  This is a good place for updating it, as +	 * we wont ever return from this function (so the invalid +	 * canaries already on the stack wont ever trigger). +	 */ +	boot_init_stack_canary(); +  	/* endless idle loop with no priority at all */  	while (1) {  		tick_nohz_stop_sched_tick(1); @@ -221,61 +237,6 @@ void show_regs(struct pt_regs *regs)  	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);  } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -	struct task_struct *me = current; -	struct thread_struct *t = &me->thread; - -	if (me->thread.io_bitmap_ptr) { -		struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - -		kfree(t->io_bitmap_ptr); -		t->io_bitmap_ptr = NULL; -		clear_thread_flag(TIF_IO_BITMAP); -		/* -		 * Careful, clear this in the TSS too: -		 */ -		memset(tss->io_bitmap, 0xff, t->io_bitmap_max); -		t->io_bitmap_max = 0; -		put_cpu(); -	} - -	ds_exit_thread(current); -} - -void flush_thread(void) -{ -	struct task_struct *tsk = current; - -	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { -		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); -		if (test_tsk_thread_flag(tsk, TIF_IA32)) { -			clear_tsk_thread_flag(tsk, TIF_IA32); -		} else { -			set_tsk_thread_flag(tsk, TIF_IA32); -			current_thread_info()->status |= TS_COMPAT; -		} -	} -	clear_tsk_thread_flag(tsk, TIF_DEBUG); - -	tsk->thread.debugreg0 = 0; -	tsk->thread.debugreg1 = 0; -	tsk->thread.debugreg2 = 0; -	tsk->thread.debugreg3 = 0; -	tsk->thread.debugreg6 = 0; -	tsk->thread.debugreg7 = 0; -	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -	/* -	 * Forget coprocessor state.. 
-	 */ -	tsk->fpu_counter = 0; -	clear_fpu(tsk); -	clear_used_math(); -} -  void release_thread(struct task_struct *dead_task)  {  	if (dead_task->mm) { @@ -397,7 +358,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)  	load_gs_index(0);  	regs->ip		= new_ip;  	regs->sp		= new_sp; -	write_pda(oldrsp, new_sp); +	percpu_write(old_rsp, new_sp);  	regs->cs		= __USER_CS;  	regs->ss		= __USER_DS;  	regs->flags		= 0x200; @@ -409,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)  }  EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ -	write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ -	preempt_disable(); -	if (!test_and_set_thread_flag(TIF_NOTSC)) -		/* -		 * Must flip the CPU state synchronously with -		 * TIF_NOTSC in the current running context. -		 */ -		hard_disable_TSC(); -	preempt_enable(); -} - -static void hard_enable_TSC(void) -{ -	write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ -	preempt_disable(); -	if (test_and_clear_thread_flag(TIF_NOTSC)) -		/* -		 * Must flip the CPU state synchronously with -		 * TIF_NOTSC in the current running context. -		 */ -		hard_enable_TSC(); -	preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ -	unsigned int val; - -	if (test_thread_flag(TIF_NOTSC)) -		val = PR_TSC_SIGSEGV; -	else -		val = PR_TSC_ENABLE; - -	return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ -	if (val == PR_TSC_SIGSEGV) -		disable_TSC(); -	else if (val == PR_TSC_ENABLE) -		enable_TSC(); -	else -		return -EINVAL; - -	return 0; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) - -static inline void __switch_to_xtra(struct task_struct *prev_p, -				    struct task_struct *next_p, -				    struct tss_struct *tss) -{ -	struct thread_struct *prev, *next; - -	prev = &prev_p->thread, -	next = &next_p->thread; - -	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || -	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) -		ds_switch_to(prev_p, next_p); -	else if (next->debugctlmsr != prev->debugctlmsr) -		update_debugctlmsr(next->debugctlmsr); - -	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { -		loaddebug(next, 0); -		loaddebug(next, 1); -		loaddebug(next, 2); -		loaddebug(next, 3); -		/* no 4 and 5 */ -		loaddebug(next, 6); -		loaddebug(next, 7); -	} - -	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ -	    test_tsk_thread_flag(next_p, TIF_NOTSC)) { -		/* prev and next are different */ -		if (test_tsk_thread_flag(next_p, TIF_NOTSC)) -			hard_disable_TSC(); -		else -			hard_enable_TSC(); -	} - -	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { -		/* -		 * Copy the relevant range of the IO bitmap. -		 * Normally this is 128 bytes or less: -		 */ -		memcpy(tss->io_bitmap, next->io_bitmap_ptr, -		       max(prev->io_bitmap_max, next->io_bitmap_max)); -	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { -		/* -		 * Clear any possible leftover bits: -		 */ -		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); -	} -} -  /*   *	switch_to(x,y) should switch tasks from x to y.   * @@ -618,21 +467,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	/*  	 * Switch the PDA and FPU contexts.  	 
*/ -	prev->usersp = read_pda(oldrsp); -	write_pda(oldrsp, next->usersp); -	write_pda(pcurrent, next_p); +	prev->usersp = percpu_read(old_rsp); +	percpu_write(old_rsp, next->usersp); +	percpu_write(current_task, next_p); -	write_pda(kernelstack, +	percpu_write(kernel_stack,  		  (unsigned long)task_stack_page(next_p) + -		  THREAD_SIZE - PDA_STACKOFFSET); -#ifdef CONFIG_CC_STACKPROTECTOR -	write_pda(stack_canary, next_p->stack_canary); -	/* -	 * Build time only check to make sure the stack_canary is at -	 * offset 40 in the pda; this is a gcc ABI requirement -	 */ -	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); -#endif +		  THREAD_SIZE - KERNEL_STACK_OFFSET);  	/*  	 * Now maybe reload the debug registers and handle I/O bitmaps @@ -686,11 +527,6 @@ void set_personality_64bit(void)  	current->personality &= ~READ_IMPLIES_EXEC;  } -asmlinkage long sys_fork(struct pt_regs *regs) -{ -	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} -  asmlinkage long  sys_clone(unsigned long clone_flags, unsigned long newsp,  	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) @@ -700,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,  	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);  } -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ -	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, -		    NULL, NULL); -} -  unsigned long get_wchan(struct task_struct *p)  {  	unsigned long stack; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 06ca07f6ad8..3d9672e59c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)  static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)  {  	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); -	regno >>= 2; -	if (regno > FS) -		--regno; -	return ®s->bx + regno; +	return ®s->bx + (regno >> 2);  }  static u16 get_segment_reg(struct task_struct *task, unsigned long offset) @@ -90,9 +87,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)  	if (offset != offsetof(struct user_regs_struct, gs))  		retval = *pt_regs_access(task_pt_regs(task), offset);  	else { -		retval = task->thread.gs;  		if (task == current) -			savesegment(gs, retval); +			retval = get_user_gs(task_pt_regs(task)); +		else +			retval = task_user_gs(task);  	}  	return retval;  } @@ -126,13 +124,10 @@ static int set_segment_reg(struct task_struct *task,  		break;  	case offsetof(struct user_regs_struct, gs): -		task->thread.gs = value;  		if (task == current) -			/* -			 * The user-mode %gs is not affected by -			 * kernel entry, so we must update the CPU. 
-			 */ -			loadsegment(gs, value); +			set_user_gs(task_pt_regs(task), value); +		else +			task_user_gs(task) = value;  	}  	return 0; @@ -273,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task)  	if (test_tsk_thread_flag(task, TIF_IA32))  		return IA32_PAGE_OFFSET - 3;  #endif -	return TASK_SIZE64 - 7; +	return TASK_SIZE_MAX - 7;  }  #endif	/* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4526b3a75ed..2aef36d8aca 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -14,6 +14,7 @@  #include <asm/reboot.h>  #include <asm/pci_x86.h>  #include <asm/virtext.h> +#include <asm/cpu.h>  #ifdef CONFIG_X86_32  # include <linux/dmi.h> @@ -23,8 +24,6 @@  # include <asm/iommu.h>  #endif -#include <mach_ipi.h> -  /*   * Power off function, if any   */ @@ -658,7 +657,7 @@ static int crash_nmi_callback(struct notifier_block *self,  static void smp_send_nmi_allbutself(void)  { -	send_IPI_allbutself(NMI_VECTOR); +	apic->send_IPI_allbutself(NMI_VECTOR);  }  static struct notifier_block crash_nmi_nb = { diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index a160f311972..2064d0aa8d2 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -7,7 +7,7 @@   */  #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h>  #include <asm/kexec.h>  #include <asm/processor-flags.h> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index f5afe665a82..d32cfb27a47 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -7,10 +7,10 @@   */  #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h>  #include <asm/kexec.h>  #include <asm/processor-flags.h> -#include <asm/pgtable.h> +#include <asm/pgtable_types.h>  /*   * Must be relocatable PIC code callable as a C function @@ -29,122 +29,6 @@ relocate_kernel:  	 * %rdx start address  	 */ -	/* map the control page at its virtual address */ - -	movq	$0x0000ff8000000000, %r10        /* mask */ -	mov	$(39 - 3), %cl                   /* bits to shift */ -	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PGD)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PUD_0)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PUD_0)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PMD_0)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PMD_0)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PTE_0)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PTE_0)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	/* identity map the control page at its physical address */ - -	movq	$0x0000ff8000000000, %r10        /* mask */ -	mov	$(39 - 3), %cl                   /* bits to shift */ -	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PGD)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PUD_1)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	
PTR(VA_PUD_1)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PMD_1)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PMD_1)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_PTE_1)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -	shrq	$9, %r10 -	sub	$9, %cl - -	movq	%r11, %r9 -	andq	%r10, %r9 -	shrq	%cl, %r9 - -	movq	PTR(VA_PTE_1)(%rsi), %r8 -	addq	%r8, %r9 -	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8 -	orq	$PAGE_ATTR, %r8 -	movq	%r8, (%r9) - -relocate_new_kernel: -	/* %rdi indirection_page -	 * %rsi page_list -	 * %rdx start address -	 */ -  	/* zero out flags, and disable interrupts */  	pushq $0  	popfq @@ -156,9 +40,8 @@ relocate_new_kernel:  	/* get physical address of page table now too */  	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx -	/* switch to new set of page tables */ -	movq	PTR(PA_PGD)(%rsi), %r9 -	movq	%r9, %cr3 +	/* Switch to the identity mapped page tables */ +	movq	%rcx, %cr3  	/* setup a new stack at the end of the physical control page */  	lea	PAGE_SIZE(%r8), %rsp @@ -194,9 +77,7 @@ identity_mapped:  	jmp 1f  1: -	/* Switch to the identity mapped page tables, -	 * and flush the TLB. -	*/ +	/* Flush the TLB (needed?) */  	movq	%rcx, %cr3  	/* Do the copies */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6a8811a6932..f28c56e6bf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -74,14 +74,15 @@  #include <asm/e820.h>  #include <asm/mpspec.h>  #include <asm/setup.h> -#include <asm/arch_hooks.h>  #include <asm/efi.h> +#include <asm/timer.h> +#include <asm/i8259.h>  #include <asm/sections.h>  #include <asm/dmi.h>  #include <asm/io_apic.h>  #include <asm/ist.h>  #include <asm/vmi.h> -#include <setup_arch.h> +#include <asm/setup_arch.h>  #include <asm/bios_ebda.h>  #include <asm/cacheflush.h>  #include <asm/processor.h> @@ -89,7 +90,7 @@  #include <asm/system.h>  #include <asm/vsyscall.h> -#include <asm/smp.h> +#include <asm/cpu.h>  #include <asm/desc.h>  #include <asm/dma.h>  #include <asm/iommu.h> @@ -97,7 +98,6 @@  #include <asm/mmu_context.h>  #include <asm/proto.h> -#include <mach_apic.h>  #include <asm/paravirt.h>  #include <asm/hypervisor.h> @@ -112,6 +112,20 @@  #define ARCH_SETUP  #endif +unsigned int boot_cpu_id __read_mostly; + +#ifdef CONFIG_X86_64 +int default_cpu_present_to_apicid(int mps_cpu) +{ +	return __default_cpu_present_to_apicid(mps_cpu); +} + +int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +{ +	return __default_check_phys_apicid_present(boot_cpu_physical_apicid); +} +#endif +  #ifndef CONFIG_DEBUG_BOOT_PARAMS  struct boot_params __initdata boot_params;  #else @@ -188,7 +202,9 @@ struct ist_info ist_info;  #endif  #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { +	.x86_phys_bits = MAX_PHYSMEM_BITS, +};  EXPORT_SYMBOL(boot_cpu_data);  #endif @@ -586,20 +602,7 @@ static int __init setup_elfcorehdr(char *arg)  early_param("elfcorehdr", setup_elfcorehdr);  #endif -static int __init default_update_genapic(void) -{ -#ifdef CONFIG_X86_SMP -# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) -	genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; -# endif -#endif - -	return 0; -} - -static struct x86_quirks default_x86_quirks __initdata = { -	.update_genapic         = default_update_genapic, -}; +static struct x86_quirks default_x86_quirks __initdata;  struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; @@ -656,7 +659,6 @@ void __init 
setup_arch(char **cmdline_p)  #ifdef CONFIG_X86_32  	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));  	visws_early_detect(); -	pre_setup_arch_hook();  #else  	printk(KERN_INFO "Command line: %s\n", boot_command_line);  #endif @@ -824,8 +826,7 @@ void __init setup_arch(char **cmdline_p)  #else  	num_physpages = max_pfn; - 	if (cpu_has_x2apic) - 		check_x2apic(); +	check_x2apic();  	/* How many end-of-memory variables you have, grandma! */  	/* need this before calling reserve_initrd */ @@ -865,9 +866,7 @@ void __init setup_arch(char **cmdline_p)  	reserve_initrd(); -#ifdef CONFIG_X86_64  	vsmp_init(); -#endif  	io_delay_init(); @@ -893,12 +892,11 @@ void __init setup_arch(char **cmdline_p)  	 */  	acpi_reserve_bootmem();  #endif -#ifdef CONFIG_X86_FIND_SMP_CONFIG  	/*  	 * Find and reserve possible boot-time SMP configuration:  	 */  	find_smp_config(); -#endif +  	reserve_crashkernel();  #ifdef CONFIG_X86_64 @@ -925,9 +923,7 @@ void __init setup_arch(char **cmdline_p)  	map_vsyscall();  #endif -#ifdef CONFIG_X86_GENERICARCH  	generic_apic_probe(); -#endif  	early_quirks(); @@ -978,4 +974,95 @@ void __init setup_arch(char **cmdline_p)  #endif  } +#ifdef CONFIG_X86_32 + +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + *	Perform any necessary interrupt initialisation prior to setting up + *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA + *	interrupts should be initialised here if the machine emulates a PC + *	in any way. + **/ +void __init x86_quirk_pre_intr_init(void) +{ +	if (x86_quirks->arch_pre_intr_init) { +		if (x86_quirks->arch_pre_intr_init()) +			return; +	} +	init_ISA_irqs(); +} + +/** + * x86_quirk_intr_init - post gate setup interrupt initialisation + * + * Description: + *	Fill in any interrupts that may have been left out by the general + *	init_IRQ() routine.  interrupts having to do with the machine rather + *	than the devices on the I/O bus (like APIC interrupts in intel MP + *	systems) are started here. + **/ +void __init x86_quirk_intr_init(void) +{ +	if (x86_quirks->arch_intr_init) { +		if (x86_quirks->arch_intr_init()) +			return; +	} +} + +/** + * x86_quirk_trap_init - initialise system specific traps + * + * Description: + *	Called as the final act of trap_init().  Used in VISWS to initialise + *	the various board specific APIC traps. + **/ +void __init x86_quirk_trap_init(void) +{ +	if (x86_quirks->arch_trap_init) { +		if (x86_quirks->arch_trap_init()) +			return; +	} +} + +static struct irqaction irq0  = { +	.handler = timer_interrupt, +	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, +	.mask = CPU_MASK_NONE, +	.name = "timer" +}; + +/** + * x86_quirk_pre_time_init - do any specific initialisations before. + * + **/ +void __init x86_quirk_pre_time_init(void) +{ +	if (x86_quirks->arch_pre_time_init) +		x86_quirks->arch_pre_time_init(); +} +/** + * x86_quirk_time_init - do any specific initialisations for the system timer. 
+ * + * Description: + *	Must plug the system timer interrupt source at HZ into the IRQ listed + *	in irq_vectors.h:TIMER_IRQ + **/ +void __init x86_quirk_time_init(void) +{ +	if (x86_quirks->arch_time_init) { +		/* +		 * A nonzero return code does not mean failure, it means +		 * that the architecture quirk does not want any +		 * generic (timer) setup to be performed after this: +		 */ +		if (x86_quirks->arch_time_init()) +			return; +	} + +	irq0.mask = cpumask_of_cpu(0); +	setup_irq(0, &irq0); +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 01161077a49..efa615f2bf4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,402 +7,482 @@  #include <linux/crash_dump.h>  #include <linux/smp.h>  #include <linux/topology.h> +#include <linux/pfn.h>  #include <asm/sections.h>  #include <asm/processor.h>  #include <asm/setup.h>  #include <asm/mpspec.h>  #include <asm/apicdef.h>  #include <asm/highmem.h> +#include <asm/proto.h> +#include <asm/cpumask.h> +#include <asm/cpu.h> +#include <asm/stackprotector.h> -#ifdef CONFIG_X86_LOCAL_APIC -unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -EXPORT_SYMBOL(boot_cpu_physical_apicid); -unsigned int max_physical_apicid; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...)  #endif -/* map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define	X86_64_NUMA	1 - -/* map cpu index to node index */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +#ifdef CONFIG_X86_64 +#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +#else +#define BOOT_PERCPU_OFFSET 0 +#endif -/* which logical CPUs are on which nodes */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); +DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +EXPORT_PER_CPU_SYMBOL(this_cpu_off); -/* setup node_to_cpumask_map */ -static void __init setup_node_to_cpumask_map(void); +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, +}; +EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations.  Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base.  On x86_32, anything can + * address anywhere.  No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE  #else -static inline void setup_node_to_cpumask_map(void) { } +#define PERCPU_FIRST_CHUNK_RESERVE	0  #endif -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) -/* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas.  These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. 
+/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA.  This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false.   */ -static void __init setup_per_cpu_maps(void) +static bool __init pcpu_need_numa(void)  { -	int cpu; +#ifdef CONFIG_NEED_MULTIPLE_NODES +	pg_data_t *last = NULL; +	unsigned int cpu;  	for_each_possible_cpu(cpu) { -		per_cpu(x86_cpu_to_apicid, cpu) = -				early_per_cpu_map(x86_cpu_to_apicid, cpu); -		per_cpu(x86_bios_cpu_apicid, cpu) = -				early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA -		per_cpu(x86_cpu_to_node_map, cpu) = -				early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif -	} +		int node = early_cpu_to_node(cpu); -	/* indicate the early static arrays will soon be gone */ -	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; -	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA -	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +		if (node_online(node) && NODE_DATA(node) && +		    last && last != NODE_DATA(node)) +			return true; + +		last = NODE_DATA(node); +	}  #endif +	return false;  } -#ifdef CONFIG_X86_32 -/* - * Great future not-so-futuristic plan: make i386 and x86_64 do it - * the same way +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure.   */ -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(__per_cpu_offset); -static inline void setup_cpu_pda_map(void) { } - -#elif !defined(CONFIG_SMP) -static inline void setup_cpu_pda_map(void) { } +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, +					unsigned long align) +{ +	const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES +	int node = early_cpu_to_node(cpu); +	void *ptr; -#else /* CONFIG_SMP && CONFIG_X86_64 */ +	if (!node_online(node) || !NODE_DATA(node)) { +		ptr = __alloc_bootmem_nopanic(size, align, goal); +		pr_info("cpu %d has no node %d or node-local memory\n", +			cpu, node); +		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", +			 cpu, size, __pa(ptr)); +	} else { +		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), +						   size, align, goal); +		pr_debug("per cpu data for cpu%d %lu bytes on node%d at " +			 "%016lx\n", cpu, size, node, __pa(ptr)); +	} +	return ptr; +#else +	return __alloc_bootmem_nopanic(size, align, goal); +#endif +}  /* - * Allocate cpu_pda pointer table and array via alloc_bootmem. + * Remap allocator + * + * This allocator uses PMD page as unit.  A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk.  Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk.  The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly.   
*/ -static void __init setup_cpu_pda_map(void) -{ -	char *pda; -	struct x8664_pda **new_cpu_pda; -	unsigned long size; -	int cpu; - -	size = roundup(sizeof(struct x8664_pda), cache_line_size()); - -	/* allocate cpu_pda array and pointer table */ -	{ -		unsigned long tsize = nr_cpu_ids * sizeof(void *); -		unsigned long asize = size * (nr_cpu_ids - 1); +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; -		tsize = roundup(tsize, cache_line_size()); -		new_cpu_pda = alloc_bootmem(tsize + asize); -		pda = (char *)new_cpu_pda + tsize; -	} +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ +	size_t off = (size_t)pageno << PAGE_SHIFT; -	/* initialize pointer table to static pda's */ -	for_each_possible_cpu(cpu) { -		if (cpu == 0) { -			/* leave boot cpu pda in place */ -			new_cpu_pda[0] = cpu_pda(0); -			continue; -		} -		new_cpu_pda[cpu] = (struct x8664_pda *)pda; -		new_cpu_pda[cpu]->in_bootmem = 1; -		pda += size; -	} +	if (off >= pcpur_size) +		return NULL; -	/* point to new pointer table */ -	_cpu_pda = new_cpu_pda; +	return virt_to_page(pcpur_ptrs[cpu] + off);  } -#endif /* CONFIG_SMP && CONFIG_X86_64 */ +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ +	static struct vm_struct vm; +	pg_data_t *last; +	size_t ptrs_size, dyn_size; +	unsigned int cpu; +	ssize_t ret; -#ifdef CONFIG_X86_64 +	/* +	 * If large page isn't supported, there's no benefit in doing +	 * this.  Also, on non-NUMA, embedding is better. +	 */ +	if (!cpu_has_pse || pcpu_need_numa()) +		return -EINVAL; -/* correctly size the local cpu masks */ -static void __init setup_cpu_local_masks(void) -{ -	alloc_bootmem_cpumask_var(&cpu_initialized_mask); -	alloc_bootmem_cpumask_var(&cpu_callin_mask); -	alloc_bootmem_cpumask_var(&cpu_callout_mask); -	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); -} +	last = NULL; +	for_each_possible_cpu(cpu) { +		int node = early_cpu_to_node(cpu); -#else /* CONFIG_X86_32 */ +		if (node_online(node) && NODE_DATA(node) && +		    last && last != NODE_DATA(node)) +			goto proceed; -static inline void setup_cpu_local_masks(void) -{ -} +		last = NODE_DATA(node); +	} +	return -EINVAL; -#endif /* CONFIG_X86_32 */ +proceed: +	/* +	 * Currently supports only single page.  Supporting multiple +	 * pages won't be too difficult if it ever becomes necessary. +	 */ +	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + +			       PERCPU_DYNAMIC_RESERVE); +	if (pcpur_size > PMD_SIZE) { +		pr_warning("PERCPU: static data is larger than large page, " +			   "can't use large page\n"); +		return -EINVAL; +	} +	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
- * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ -	ssize_t size, old_size; -	char *ptr; -	int cpu; -	unsigned long align = 1; +	/* allocate pointer array and alloc large pages */ +	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); +	pcpur_ptrs = alloc_bootmem(ptrs_size); -	/* Setup cpu_pda map */ -	setup_cpu_pda_map(); +	for_each_possible_cpu(cpu) { +		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); +		if (!pcpur_ptrs[cpu]) +			goto enomem; -	/* Copy section for each CPU (we discard the original) */ -	old_size = PERCPU_ENOUGH_ROOM; -	align = max_t(unsigned long, PAGE_SIZE, align); -	size = roundup(old_size, align); +		/* +		 * Only use pcpur_size bytes and give back the rest. +		 * +		 * Ingo: The 2MB up-rounding bootmem is needed to make +		 * sure the partial 2MB page is still fully RAM - it's +		 * not well-specified to have a PAT-incompatible area +		 * (unmapped RAM, device memory, etc.) in that hole. +		 */ +		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), +			     PMD_SIZE - pcpur_size); -	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", -		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); +		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); +	} -	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); +	/* allocate address and map */ +	vm.flags = VM_ALLOC; +	vm.size = num_possible_cpus() * PMD_SIZE; +	vm_area_register_early(&vm, PMD_SIZE);  	for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES -		ptr = __alloc_bootmem(size, align, -				 __pa(MAX_DMA_ADDRESS)); -#else -		int node = early_cpu_to_node(cpu); -		if (!node_online(node) || !NODE_DATA(node)) { -			ptr = __alloc_bootmem(size, align, -					 __pa(MAX_DMA_ADDRESS)); -			pr_info("cpu %d has no node %d or node-local memory\n", -				cpu, node); -			pr_debug("per cpu data for cpu%d at %016lx\n", -				 cpu, __pa(ptr)); -		} else { -			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, -							__pa(MAX_DMA_ADDRESS)); -			pr_debug("per cpu data for cpu%d on node%d at %016lx\n", -				cpu, node, __pa(ptr)); -		} -#endif -		per_cpu_offset(cpu) = ptr - __per_cpu_start; -		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); +		pmd_t *pmd; + +		pmd = populate_extra_pmd((unsigned long)vm.addr +					 + cpu * PMD_SIZE); +		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), +				     PAGE_KERNEL_LARGE));  	} -	/* Setup percpu data maps */ -	setup_per_cpu_maps(); +	/* we're ready, commit */ +	pr_info("PERCPU: Remapped at %p with large pages, static data " +		"%zu bytes\n", vm.addr, static_size); -	/* Setup node to cpumask map */ -	setup_node_to_cpumask_map(); +	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, +				     PERCPU_FIRST_CHUNK_RESERVE, +				     PMD_SIZE, dyn_size, vm.addr, NULL); +	goto out_free_ar; -	/* Setup cpu initialized, callin, callout masks */ -	setup_cpu_local_masks(); +enomem: +	for_each_possible_cpu(cpu) +		if (pcpur_ptrs[cpu]) +			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); +	ret = -ENOMEM; +out_free_ar: +	free_bootmem(__pa(pcpur_ptrs), ptrs_size); +	return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ +	return -EINVAL;  } -  #endif -#ifdef X86_64_NUMA -  /* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. + * Embedding allocator   * - * Note: node_to_cpumask() is not valid until after this is done. 
+ * The first chunk is sized to just contain the static area plus + * module and dynamic reserves, and allocated as a contiguous area + * using bootmem allocator and used as-is without being mapped into + * vmalloc area.  This enables the first chunk to piggy back on the + * linear physical PMD mapping and doesn't add any additional pressure + * to TLB.  Note that if the needed size is smaller than the minimum + * unit size, the leftover is returned to the bootmem allocator.   */ -static void __init setup_node_to_cpumask_map(void) -{ -	unsigned int node, num = 0; -	cpumask_t *map; +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; -	/* setup nr_node_ids if not done yet */ -	if (nr_node_ids == MAX_NUMNODES) { -		for_each_node_mask(node, node_possible_map) -			num = node; -		nr_node_ids = num + 1; -	} - -	/* allocate the map */ -	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ +	size_t off = (size_t)pageno << PAGE_SHIFT; -	pr_debug("Node to cpumask map at %p for %d nodes\n", -		 map, nr_node_ids); +	if (off >= pcpue_size) +		return NULL; -	/* node_to_cpumask() will now work */ -	node_to_cpumask_map = map; +	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);  } -void __cpuinit numa_set_node(int cpu, int node) +static ssize_t __init setup_pcpu_embed(size_t static_size)  { -	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); +	unsigned int cpu; +	size_t dyn_size; -	if (cpu_pda(cpu) && node != NUMA_NO_NODE) -		cpu_pda(cpu)->nodenumber = node; +	/* +	 * If large page isn't supported, there's no benefit in doing +	 * this.  Also, embedding allocation doesn't play well with +	 * NUMA. +	 */ +	if (!cpu_has_pse || pcpu_need_numa()) +		return -EINVAL; -	if (cpu_to_node_map) -		cpu_to_node_map[cpu] = node; +	/* allocate and copy */ +	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + +			       PERCPU_DYNAMIC_RESERVE); +	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); +	dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; -	else if (per_cpu_offset(cpu)) -		per_cpu(x86_cpu_to_node_map, cpu) = node; +	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, +				       PAGE_SIZE); +	if (!pcpue_ptr) +		return -ENOMEM; -	else -		pr_debug("Setting node for non-present cpu %d\n", cpu); -} +	for_each_possible_cpu(cpu) { +		void *ptr = pcpue_ptr + cpu * pcpue_unit_size; -void __cpuinit numa_clear_node(int cpu) -{ -	numa_set_node(cpu, NUMA_NO_NODE); +		free_bootmem(__pa(ptr + pcpue_size), +			     pcpue_unit_size - pcpue_size); +		memcpy(ptr, __per_cpu_load, static_size); +	} + +	/* we're ready, commit */ +	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", +		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + +	return pcpu_setup_first_chunk(pcpue_get_page, static_size, +				      PERCPU_FIRST_CHUNK_RESERVE, +				      pcpue_unit_size, dyn_size, +				      pcpue_ptr, NULL);  } -#ifndef CONFIG_DEBUG_PER_CPU_MAPS +/* + * 4k page allocator + * + * This is the basic allocator.  Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. 
+ */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; -void __cpuinit numa_add_cpu(int cpu) +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)  { -	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +	if (pageno < pcpu4k_nr_static_pages) +		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; +	return NULL;  } -void __cpuinit numa_remove_cpu(int cpu) +static void __init pcpu4k_populate_pte(unsigned long addr)  { -	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +	populate_extra_pte(addr);  } -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static ssize_t __init setup_pcpu_4k(size_t static_size)  { -	int node = cpu_to_node(cpu); -	cpumask_t *mask; -	char buf[64]; +	size_t pages_size; +	unsigned int cpu; +	int i, j; +	ssize_t ret; -	if (node_to_cpumask_map == NULL) { -		printk(KERN_ERR "node_to_cpumask_map NULL\n"); -		dump_stack(); -		return; -	} +	pcpu4k_nr_static_pages = PFN_UP(static_size); -	mask = &node_to_cpumask_map[node]; -	if (enable) -		cpu_set(cpu, *mask); -	else -		cpu_clear(cpu, *mask); +	/* unaligned allocations can't be freed, round up to page size */ +	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() +			       * sizeof(pcpu4k_pages[0])); +	pcpu4k_pages = alloc_bootmem(pages_size); -	cpulist_scnprintf(buf, sizeof(buf), mask); -	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", -		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); -} +	/* allocate and copy */ +	j = 0; +	for_each_possible_cpu(cpu) +		for (i = 0; i < pcpu4k_nr_static_pages; i++) { +			void *ptr; -void __cpuinit numa_add_cpu(int cpu) -{ -	numa_set_cpumask(cpu, 1); -} +			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); +			if (!ptr) +				goto enomem; -void __cpuinit numa_remove_cpu(int cpu) -{ -	numa_set_cpumask(cpu, 0); +			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); +			pcpu4k_pages[j++] = virt_to_page(ptr); +		} + +	/* we're ready, commit */ +	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", +		pcpu4k_nr_static_pages, static_size); + +	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, +				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, +				     pcpu4k_populate_pte); +	goto out_free_ar; + +enomem: +	while (--j >= 0) +		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); +	ret = -ENOMEM; +out_free_ar: +	free_bootmem(__pa(pcpu4k_pages), pages_size); +	return ret;  } -int cpu_to_node(int cpu) +static inline void setup_percpu_segment(int cpu)  { -	if (early_per_cpu_ptr(x86_cpu_to_node_map)) { -		printk(KERN_WARNING -			"cpu_to_node(%d): usage too early!\n", cpu); -		dump_stack(); -		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; -	} -	return per_cpu(x86_cpu_to_node_map, cpu); +#ifdef CONFIG_X86_32 +	struct desc_struct gdt; + +	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, +			0x2 | DESCTYPE_S, 0x8); +	gdt.s = 1; +	write_gdt_entry(get_cpu_gdt_table(cpu), +			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); +#endif  } -EXPORT_SYMBOL(cpu_to_node);  /* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
+ * Always point %gs to its beginning   */ -int early_cpu_to_node(int cpu) +void __init setup_per_cpu_areas(void)  { -	if (early_per_cpu_ptr(x86_cpu_to_node_map)) -		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - -	if (!per_cpu_offset(cpu)) { -		printk(KERN_WARNING -			"early_cpu_to_node(%d): no per_cpu area!\n", cpu); -		dump_stack(); -		return NUMA_NO_NODE; -	} -	return per_cpu(x86_cpu_to_node_map, cpu); -} +	size_t static_size = __per_cpu_end - __per_cpu_start; +	unsigned int cpu; +	unsigned long delta; +	size_t pcpu_unit_size; +	ssize_t ret; +	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", +		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); -/* empty cpumask */ -static const cpumask_t cpu_mask_none; +	/* +	 * Allocate percpu area.  If PSE is supported, try to make use +	 * of large page mappings.  Please read comments on top of +	 * each allocator for details. +	 */ +	ret = setup_pcpu_remap(static_size); +	if (ret < 0) +		ret = setup_pcpu_embed(static_size); +	if (ret < 0) +		ret = setup_pcpu_4k(static_size); +	if (ret < 0) +		panic("cannot allocate static percpu area (%zu bytes, err=%zd)", +		      static_size, ret); -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -const cpumask_t *cpumask_of_node(int node) -{ -	if (node_to_cpumask_map == NULL) { -		printk(KERN_WARNING -			"cpumask_of_node(%d): no node_to_cpumask_map!\n", -			node); -		dump_stack(); -		return (const cpumask_t *)&cpu_online_map; -	} -	if (node >= nr_node_ids) { -		printk(KERN_WARNING -			"cpumask_of_node(%d): node > nr_node_ids(%d)\n", -			node, nr_node_ids); -		dump_stack(); -		return &cpu_mask_none; -	} -	return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(cpumask_of_node); +	pcpu_unit_size = ret; -/* - * Returns a bitmask of CPUs on Node 'node'. - * - * Side note: this function creates the returned cpumask on the stack - * so with a high NR_CPUS count, excessive stack space is used.  The - * node_to_cpumask_ptr function should be used whenever possible. - */ -cpumask_t node_to_cpumask(int node) -{ -	if (node_to_cpumask_map == NULL) { -		printk(KERN_WARNING -			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node); -		dump_stack(); -		return cpu_online_map; -	} -	if (node >= nr_node_ids) { -		printk(KERN_WARNING -			"node_to_cpumask(%d): node > nr_node_ids(%d)\n", -			node, nr_node_ids); -		dump_stack(); -		return cpu_mask_none; +	/* alrighty, percpu areas up and running */ +	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; +	for_each_possible_cpu(cpu) { +		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; +		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); +		per_cpu(cpu_number, cpu) = cpu; +		setup_percpu_segment(cpu); +		setup_stack_canary_segment(cpu); +		/* +		 * Copy data used in early init routines from the +		 * initial arrays to the per cpu data areas.  These +		 * arrays then become expendable and the *_early_ptr's +		 * are zeroed indicating that the static arrays are +		 * gone. +		 */ +#ifdef CONFIG_X86_LOCAL_APIC +		per_cpu(x86_cpu_to_apicid, cpu) = +			early_per_cpu_map(x86_cpu_to_apicid, cpu); +		per_cpu(x86_bios_cpu_apicid, cpu) = +			early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#endif +#ifdef CONFIG_X86_64 +		per_cpu(irq_stack_ptr, cpu) = +			per_cpu(irq_stack_union.irq_stack, cpu) + +			IRQ_STACK_SIZE - 64; +#ifdef CONFIG_NUMA +		per_cpu(x86_cpu_to_node_map, cpu) = +			early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif +#endif +		/* +		 * Up to this point, the boot CPU has been using .data.init +		 * area.  
Reload any changed state for the boot CPU. +		 */ +		if (cpu == boot_cpu_id) +			switch_to_new_gdt(cpu);  	} -	return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - -/* - * --------- end of debug versions of the numa functions --------- - */ -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +	/* indicate the early static arrays will soon be gone */ +#ifdef CONFIG_X86_LOCAL_APIC +	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; +	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif -#endif /* X86_64_NUMA */ +	/* Setup node to cpumask map */ +	setup_node_to_cpumask_map(); +	/* Setup cpu initialized, callin, callout masks */ +	setup_cpu_local_masks(); +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index df0587f24c5..d2cc6428c58 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -50,27 +50,23 @@  # define FIX_EFLAGS	__FIX_EFLAGS  #endif -#define COPY(x)			{		\ -	err |= __get_user(regs->x, &sc->x);	\ -} +#define COPY(x)			do {			\ +	get_user_ex(regs->x, &sc->x);			\ +} while (0) -#define COPY_SEG(seg)		{			\ -		unsigned short tmp;			\ -		err |= __get_user(tmp, &sc->seg);	\ -		regs->seg = tmp;			\ -} +#define GET_SEG(seg)		({			\ +	unsigned short tmp;				\ +	get_user_ex(tmp, &sc->seg);			\ +	tmp;						\ +}) -#define COPY_SEG_CPL3(seg)	{			\ -		unsigned short tmp;			\ -		err |= __get_user(tmp, &sc->seg);	\ -		regs->seg = tmp | 3;			\ -} +#define COPY_SEG(seg)		do {			\ +	regs->seg = GET_SEG(seg);			\ +} while (0) -#define GET_SEG(seg)		{			\ -		unsigned short tmp;			\ -		err |= __get_user(tmp, &sc->seg);	\ -		loadsegment(seg, tmp);			\ -} +#define COPY_SEG_CPL3(seg)	do {			\ +	regs->seg = GET_SEG(seg) | 3;			\ +} while (0)  static int  restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, @@ -83,45 +79,49 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,  	/* Always make any pending restarted system calls return -EINTR */  	current_thread_info()->restart_block.fn = do_no_restart_syscall; +	get_user_try { +  #ifdef CONFIG_X86_32 -	GET_SEG(gs); -	COPY_SEG(fs); -	COPY_SEG(es); -	COPY_SEG(ds); +		set_user_gs(regs, GET_SEG(gs)); +		COPY_SEG(fs); +		COPY_SEG(es); +		COPY_SEG(ds);  #endif /* CONFIG_X86_32 */ -	COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); -	COPY(dx); COPY(cx); COPY(ip); +		COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); +		COPY(dx); COPY(cx); COPY(ip);  #ifdef CONFIG_X86_64 -	COPY(r8); -	COPY(r9); -	COPY(r10); -	COPY(r11); -	COPY(r12); -	COPY(r13); -	COPY(r14); -	COPY(r15); +		COPY(r8); +		COPY(r9); +		COPY(r10); +		COPY(r11); +		COPY(r12); +		COPY(r13); +		COPY(r14); +		COPY(r15);  #endif /* CONFIG_X86_64 */  #ifdef CONFIG_X86_32 -	COPY_SEG_CPL3(cs); -	COPY_SEG_CPL3(ss); +		COPY_SEG_CPL3(cs); +		COPY_SEG_CPL3(ss);  #else /* !CONFIG_X86_32 */ -	/* Kernel saves and restores only the CS segment register on signals, -	 * which is the bare minimum needed to allow mixed 32/64-bit code. -	 * App's signal handler can save/restore other segments if needed. */ -	COPY_SEG_CPL3(cs); +		/* Kernel saves and restores only the CS segment register on signals, +		 * which is the bare minimum needed to allow mixed 32/64-bit code. +		 * App's signal handler can save/restore other segments if needed. 
*/ +		COPY_SEG_CPL3(cs);  #endif /* CONFIG_X86_32 */ -	err |= __get_user(tmpflags, &sc->flags); -	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); -	regs->orig_ax = -1;		/* disable syscall checks */ +		get_user_ex(tmpflags, &sc->flags); +		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); +		regs->orig_ax = -1;		/* disable syscall checks */ -	err |= __get_user(buf, &sc->fpstate); -	err |= restore_i387_xstate(buf); +		get_user_ex(buf, &sc->fpstate); +		err |= restore_i387_xstate(buf); + +		get_user_ex(*pax, &sc->ax); +	} get_user_catch(err); -	err |= __get_user(*pax, &sc->ax);  	return err;  } @@ -131,57 +131,55 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,  {  	int err = 0; -#ifdef CONFIG_X86_32 -	{ -		unsigned int tmp; +	put_user_try { -		savesegment(gs, tmp); -		err |= __put_user(tmp, (unsigned int __user *)&sc->gs); -	} -	err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); -	err |= __put_user(regs->es, (unsigned int __user *)&sc->es); -	err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#ifdef CONFIG_X86_32 +		put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); +		put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); +		put_user_ex(regs->es, (unsigned int __user *)&sc->es); +		put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);  #endif /* CONFIG_X86_32 */ -	err |= __put_user(regs->di, &sc->di); -	err |= __put_user(regs->si, &sc->si); -	err |= __put_user(regs->bp, &sc->bp); -	err |= __put_user(regs->sp, &sc->sp); -	err |= __put_user(regs->bx, &sc->bx); -	err |= __put_user(regs->dx, &sc->dx); -	err |= __put_user(regs->cx, &sc->cx); -	err |= __put_user(regs->ax, &sc->ax); +		put_user_ex(regs->di, &sc->di); +		put_user_ex(regs->si, &sc->si); +		put_user_ex(regs->bp, &sc->bp); +		put_user_ex(regs->sp, &sc->sp); +		put_user_ex(regs->bx, &sc->bx); +		put_user_ex(regs->dx, &sc->dx); +		put_user_ex(regs->cx, &sc->cx); +		put_user_ex(regs->ax, &sc->ax);  #ifdef CONFIG_X86_64 -	err |= __put_user(regs->r8, &sc->r8); -	err |= __put_user(regs->r9, &sc->r9); -	err |= __put_user(regs->r10, &sc->r10); -	err |= __put_user(regs->r11, &sc->r11); -	err |= __put_user(regs->r12, &sc->r12); -	err |= __put_user(regs->r13, &sc->r13); -	err |= __put_user(regs->r14, &sc->r14); -	err |= __put_user(regs->r15, &sc->r15); +		put_user_ex(regs->r8, &sc->r8); +		put_user_ex(regs->r9, &sc->r9); +		put_user_ex(regs->r10, &sc->r10); +		put_user_ex(regs->r11, &sc->r11); +		put_user_ex(regs->r12, &sc->r12); +		put_user_ex(regs->r13, &sc->r13); +		put_user_ex(regs->r14, &sc->r14); +		put_user_ex(regs->r15, &sc->r15);  #endif /* CONFIG_X86_64 */ -	err |= __put_user(current->thread.trap_no, &sc->trapno); -	err |= __put_user(current->thread.error_code, &sc->err); -	err |= __put_user(regs->ip, &sc->ip); +		put_user_ex(current->thread.trap_no, &sc->trapno); +		put_user_ex(current->thread.error_code, &sc->err); +		put_user_ex(regs->ip, &sc->ip);  #ifdef CONFIG_X86_32 -	err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); -	err |= __put_user(regs->flags, &sc->flags); -	err |= __put_user(regs->sp, &sc->sp_at_signal); -	err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +		put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); +		put_user_ex(regs->flags, &sc->flags); +		put_user_ex(regs->sp, &sc->sp_at_signal); +		put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);  #else /* !CONFIG_X86_32 */ -	err |= __put_user(regs->flags, &sc->flags); -	err |= __put_user(regs->cs, &sc->cs); -	err |= __put_user(0, &sc->gs); -	err 
|= __put_user(0, &sc->fs); +		put_user_ex(regs->flags, &sc->flags); +		put_user_ex(regs->cs, &sc->cs); +		put_user_ex(0, &sc->gs); +		put_user_ex(0, &sc->fs);  #endif /* CONFIG_X86_32 */ -	err |= __put_user(fpstate, &sc->fpstate); +		put_user_ex(fpstate, &sc->fpstate); -	/* non-iBCS2 extensions.. */ -	err |= __put_user(mask, &sc->oldmask); -	err |= __put_user(current->thread.cr2, &sc->cr2); +		/* non-iBCS2 extensions.. */ +		put_user_ex(mask, &sc->oldmask); +		put_user_ex(current->thread.cr2, &sc->cr2); +	} put_user_catch(err);  	return err;  } @@ -189,40 +187,35 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,  /*   * Set up a signal frame.   */ -#ifdef CONFIG_X86_32 -static const struct { -	u16 poplmovl; -	u32 val; -	u16 int80; -} __attribute__((packed)) retcode = { -	0xb858,		/* popl %eax; movl $..., %eax */ -	__NR_sigreturn, -	0x80cd,		/* int $0x80 */ -}; - -static const struct { -	u8  movl; -	u32 val; -	u16 int80; -	u8  pad; -} __attribute__((packed)) rt_retcode = { -	0xb8,		/* movl $..., %eax */ -	__NR_rt_sigreturn, -	0x80cd,		/* int $0x80 */ -	0 -};  /*   * Determine which stack to use..   */ +static unsigned long align_sigframe(unsigned long sp) +{ +#ifdef CONFIG_X86_32 +	/* +	 * Align the stack pointer according to the i386 ABI, +	 * i.e. so that on function entry ((sp + 4) & 15) == 0. +	 */ +	sp = ((sp + 4) & -16ul) - 4; +#else /* !CONFIG_X86_32 */ +	sp = round_down(sp, 16) - 8; +#endif +	return sp; +} +  static inline void __user *  get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, -	     void **fpstate) +	     void __user **fpstate)  { -	unsigned long sp; -  	/* Default to using normal stack */ -	sp = regs->sp; +	unsigned long sp = regs->sp; + +#ifdef CONFIG_X86_64 +	/* redzone */ +	sp -= 128; +#endif /* CONFIG_X86_64 */  	/*  	 * If we are on the alternate signal stack and would overflow it, don't. @@ -236,30 +229,52 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,  		if (sas_ss_flags(sp) == 0)  			sp = current->sas_ss_sp + current->sas_ss_size;  	} else { +#ifdef CONFIG_X86_32  		/* This is the legacy signal stack switching. */  		if ((regs->ss & 0xffff) != __USER_DS &&  			!(ka->sa.sa_flags & SA_RESTORER) &&  				ka->sa.sa_restorer)  			sp = (unsigned long) ka->sa.sa_restorer; +#endif /* CONFIG_X86_32 */  	}  	if (used_math()) { -		sp = sp - sig_xstate_size; -		*fpstate = (struct _fpstate *) sp; +		sp -= sig_xstate_size; +#ifdef CONFIG_X86_64 +		sp = round_down(sp, 64); +#endif /* CONFIG_X86_64 */ +		*fpstate = (void __user *)sp; +  		if (save_i387_xstate(*fpstate) < 0)  			return (void __user *)-1L;  	} -	sp -= frame_size; -	/* -	 * Align the stack pointer according to the i386 ABI, -	 * i.e. so that on function entry ((sp + 4) & 15) == 0. 
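The stack alignment rule that used to sit inline in get_sigframe() is now centralized in align_sigframe() above: on i386 the frame is placed so that the handler starts with ((sp + 4) & 15) == 0, while on x86-64 the frame is rounded down to 16 bytes minus 8 after skipping the 128-byte red zone. A standalone, purely illustrative check of the 32-bit arithmetic (user-space C, not kernel code):

	#include <assert.h>

	/* same expression as the 32-bit branch of align_sigframe() */
	static unsigned long align_sigframe_32(unsigned long sp)
	{
		return ((sp + 4) & -16ul) - 4;
	}

	int main(void)
	{
		unsigned long sp = 0xbffff0c7;		/* arbitrary user stack pointer */
		unsigned long aligned = align_sigframe_32(sp);

		assert(((aligned + 4) & 15) == 0);	/* the i386 ABI entry condition */
		assert(aligned <= sp);			/* the frame is only ever moved down */
		return 0;
	}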
-	 */ -	sp = ((sp + 4) & -16ul) - 4; - -	return (void __user *) sp; +	return (void __user *)align_sigframe(sp - frame_size);  } +#ifdef CONFIG_X86_32 +static const struct { +	u16 poplmovl; +	u32 val; +	u16 int80; +} __attribute__((packed)) retcode = { +	0xb858,		/* popl %eax; movl $..., %eax */ +	__NR_sigreturn, +	0x80cd,		/* int $0x80 */ +}; + +static const struct { +	u8  movl; +	u32 val; +	u16 int80; +	u8  pad; +} __attribute__((packed)) rt_retcode = { +	0xb8,		/* movl $..., %eax */ +	__NR_rt_sigreturn, +	0x80cd,		/* int $0x80 */ +	0 +}; +  static int  __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,  	      struct pt_regs *regs) @@ -336,43 +351,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))  		return -EFAULT; -	err |= __put_user(sig, &frame->sig); -	err |= __put_user(&frame->info, &frame->pinfo); -	err |= __put_user(&frame->uc, &frame->puc); -	err |= copy_siginfo_to_user(&frame->info, info); -	if (err) -		return -EFAULT; +	put_user_try { +		put_user_ex(sig, &frame->sig); +		put_user_ex(&frame->info, &frame->pinfo); +		put_user_ex(&frame->uc, &frame->puc); +		err |= copy_siginfo_to_user(&frame->info, info); -	/* Create the ucontext.  */ -	if (cpu_has_xsave) -		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); -	else -		err |= __put_user(0, &frame->uc.uc_flags); -	err |= __put_user(0, &frame->uc.uc_link); -	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); -	err |= __put_user(sas_ss_flags(regs->sp), -			  &frame->uc.uc_stack.ss_flags); -	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); -	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -				regs, set->sig[0]); -	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); -	if (err) -		return -EFAULT; +		/* Create the ucontext.  */ +		if (cpu_has_xsave) +			put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); +		else +			put_user_ex(0, &frame->uc.uc_flags); +		put_user_ex(0, &frame->uc.uc_link); +		put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +		put_user_ex(sas_ss_flags(regs->sp), +			    &frame->uc.uc_stack.ss_flags); +		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); +		err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +					regs, set->sig[0]); +		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); -	/* Set up to return from userspace.  */ -	restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); -	if (ka->sa.sa_flags & SA_RESTORER) -		restorer = ka->sa.sa_restorer; -	err |= __put_user(restorer, &frame->pretcode); +		/* Set up to return from userspace.  */ +		restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); +		if (ka->sa.sa_flags & SA_RESTORER) +			restorer = ka->sa.sa_restorer; +		put_user_ex(restorer, &frame->pretcode); -	/* -	 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 -	 * -	 * WE DO NOT USE IT ANY MORE! It's only left here for historical -	 * reasons and because gdb uses it as a signature to notice -	 * signal handler stack frames. -	 */ -	err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); +		/* +		 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 +		 * +		 * WE DO NOT USE IT ANY MORE! It's only left here for historical +		 * reasons and because gdb uses it as a signature to notice +		 * signal handler stack frames. 
+		 */ +		put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); +	} put_user_catch(err);  	if (err)  		return -EFAULT; @@ -392,24 +405,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  	return 0;  }  #else /* !CONFIG_X86_32 */ -/* - * Determine which stack to use.. - */ -static void __user * -get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) -{ -	/* Default to using normal stack - redzone*/ -	sp -= 128; - -	/* This is the X/Open sanctioned signal stack switching.  */ -	if (ka->sa.sa_flags & SA_ONSTACK) { -		if (sas_ss_flags(sp) == 0) -			sp = current->sas_ss_sp + current->sas_ss_size; -	} - -	return (void __user *)round_down(sp - size, 64); -} -  static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  			    sigset_t *set, struct pt_regs *regs)  { @@ -418,15 +413,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  	int err = 0;  	struct task_struct *me = current; -	if (used_math()) { -		fp = get_stack(ka, regs->sp, sig_xstate_size); -		frame = (void __user *)round_down( -			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - -		if (save_i387_xstate(fp) < 0) -			return -EFAULT; -	} else -		frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; +	frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);  	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))  		return -EFAULT; @@ -436,28 +423,30 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  			return -EFAULT;  	} -	/* Create the ucontext.  */ -	if (cpu_has_xsave) -		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); -	else -		err |= __put_user(0, &frame->uc.uc_flags); -	err |= __put_user(0, &frame->uc.uc_link); -	err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); -	err |= __put_user(sas_ss_flags(regs->sp), -			  &frame->uc.uc_stack.ss_flags); -	err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); -	err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); -	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +	put_user_try { +		/* Create the ucontext.  */ +		if (cpu_has_xsave) +			put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); +		else +			put_user_ex(0, &frame->uc.uc_flags); +		put_user_ex(0, &frame->uc.uc_link); +		put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +		put_user_ex(sas_ss_flags(regs->sp), +			    &frame->uc.uc_stack.ss_flags); +		put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); +		err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); +		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); -	/* Set up to return from userspace.  If provided, use a stub -	   already in userspace.  */ -	/* x86-64 should always use SA_RESTORER. */ -	if (ka->sa.sa_flags & SA_RESTORER) { -		err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); -	} else { -		/* could use a vstub here */ -		return -EFAULT; -	} +		/* Set up to return from userspace.  If provided, use a stub +		   already in userspace.  */ +		/* x86-64 should always use SA_RESTORER. 
*/ +		if (ka->sa.sa_flags & SA_RESTORER) { +			put_user_ex(ka->sa.sa_restorer, &frame->pretcode); +		} else { +			/* could use a vstub here */ +			err |= -EFAULT; +		} +	} put_user_catch(err);  	if (err)  		return -EFAULT; @@ -509,31 +498,41 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,  	      struct old_sigaction __user *oact)  {  	struct k_sigaction new_ka, old_ka; -	int ret; +	int ret = 0;  	if (act) {  		old_sigset_t mask; -		if (!access_ok(VERIFY_READ, act, sizeof(*act)) || -		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) || -		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) +		if (!access_ok(VERIFY_READ, act, sizeof(*act)))  			return -EFAULT; -		__get_user(new_ka.sa.sa_flags, &act->sa_flags); -		__get_user(mask, &act->sa_mask); +		get_user_try { +			get_user_ex(new_ka.sa.sa_handler, &act->sa_handler); +			get_user_ex(new_ka.sa.sa_flags, &act->sa_flags); +			get_user_ex(mask, &act->sa_mask); +			get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer); +		} get_user_catch(ret); + +		if (ret) +			return -EFAULT;  		siginitset(&new_ka.sa.sa_mask, mask);  	}  	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);  	if (!ret && oact) { -		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || -		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || -		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) +		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))  			return -EFAULT; -		__put_user(old_ka.sa.sa_flags, &oact->sa_flags); -		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); +		put_user_try { +			put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler); +			put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags); +			put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); +			put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer); +		} put_user_catch(ret); + +		if (ret) +			return -EFAULT;  	}  	return ret; @@ -541,14 +540,9 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,  #endif /* CONFIG_X86_32 */  #ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) +int sys_sigaltstack(struct pt_regs *regs)  { -	/* -	 * This is needed to make gcc realize it doesn't own the -	 * "struct pt_regs" -	 */ -	struct pt_regs *regs = (struct pt_regs *)&bx; -	const stack_t __user *uss = (const stack_t __user *)bx; +	const stack_t __user *uss = (const stack_t __user *)regs->bx;  	stack_t __user *uoss = (stack_t __user *)regs->cx;  	return do_sigaltstack(uss, uoss, regs->sp); @@ -566,14 +560,12 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,   * Do a signal return; undo the signal stack.   
*/  #ifdef CONFIG_X86_32 -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +unsigned long sys_sigreturn(struct pt_regs *regs)  {  	struct sigframe __user *frame; -	struct pt_regs *regs;  	unsigned long ax;  	sigset_t set; -	regs = (struct pt_regs *) &__unused;  	frame = (struct sigframe __user *)(regs->sp - 8);  	if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -600,7 +592,7 @@ badframe:  }  #endif /* CONFIG_X86_32 */ -static long do_rt_sigreturn(struct pt_regs *regs) +long sys_rt_sigreturn(struct pt_regs *regs)  {  	struct rt_sigframe __user *frame;  	unsigned long ax; @@ -631,25 +623,6 @@ badframe:  	return 0;  } -#ifdef CONFIG_X86_32 -/* - * Note: do not pass in pt_regs directly as with tail-call optimization - * GCC will incorrectly stomp on the caller's frame and corrupt user-space - * register state: - */ -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ -	struct pt_regs *regs = (struct pt_regs *)&__unused; - -	return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ -	return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ -  /*   * OK, we're invoking a handler:   */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e6faa3316bd..13f33ea8cca 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -2,7 +2,7 @@   *	Intel SMP support routines.   *   *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> - *	(c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + *	(c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>   *      (c) 2002,2003 Andi Kleen, SuSE Labs.   *   *	i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> @@ -26,8 +26,7 @@  #include <asm/tlbflush.h>  #include <asm/mmu_context.h>  #include <asm/proto.h> -#include <mach_ipi.h> -#include <mach_apic.h> +#include <asm/apic.h>  /*   *	Some notes on x86 processor bugs affecting SMP operation:   * @@ -118,12 +117,12 @@ static void native_smp_send_reschedule(int cpu)  		WARN_ON(1);  		return;  	} -	send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); +	apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);  }  void native_send_call_func_single_ipi(int cpu)  { -	send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); +	apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);  }  void native_send_call_func_ipi(const struct cpumask *mask) @@ -131,7 +130,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)  	cpumask_var_t allbutself;  	if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { -		send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +		apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);  		return;  	} @@ -140,9 +139,9 @@ void native_send_call_func_ipi(const struct cpumask *mask)  	if (cpumask_equal(mask, allbutself) &&  	    cpumask_equal(cpu_online_mask, cpu_callout_mask)) -		send_IPI_allbutself(CALL_FUNCTION_VECTOR); +		apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);  	else -		send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +		apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);  	free_cpumask_var(allbutself);  } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bb1a3b1fc87..ef7d10170c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -2,7 +2,7 @@   *	x86 SMP booting functions   *   *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> - *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + *	(c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>   *	Copyright 2001 Andi Kleen, SuSE Labs.   
*   *	Much of the core SMP work is based on previous work by Thomas Radke, to @@ -53,7 +53,6 @@  #include <asm/nmi.h>  #include <asm/irq.h>  #include <asm/idle.h> -#include <asm/smp.h>  #include <asm/trampoline.h>  #include <asm/cpu.h>  #include <asm/numa.h> @@ -61,13 +60,12 @@  #include <asm/tlbflush.h>  #include <asm/mtrr.h>  #include <asm/vmi.h> -#include <asm/genapic.h> +#include <asm/apic.h>  #include <asm/setup.h> +#include <asm/uv/uv.h>  #include <linux/mc146818rtc.h> -#include <mach_apic.h> -#include <mach_wakecpu.h> -#include <smpboot_hooks.h> +#include <asm/smpboot_hooks.h>  #ifdef CONFIG_X86_32  u8 apicid_2_node[MAX_APICID]; @@ -114,11 +112,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);  DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);  EXPORT_PER_CPU_SYMBOL(cpu_info); -static atomic_t init_deasserted; - - -/* Set if we find a B stepping CPU */ -static int __cpuinitdata smp_b_stepping; +atomic_t init_deasserted;  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) @@ -163,7 +157,7 @@ static void map_cpu_to_logical_apicid(void)  {  	int cpu = smp_processor_id();  	int apicid = logical_smp_processor_id(); -	int node = apicid_to_node(apicid); +	int node = apic->apicid_to_node(apicid);  	if (!node_online(node))  		node = first_online_node; @@ -196,7 +190,8 @@ static void __cpuinit smp_callin(void)  	 * our local APIC.  We have to wait for the IPI or we'll  	 * lock up on an APIC access.  	 */ -	wait_for_init_deassert(&init_deasserted); +	if (apic->wait_for_init_deassert) +		apic->wait_for_init_deassert(&init_deasserted);  	/*  	 * (This works even if the APIC is not enabled.) @@ -243,7 +238,8 @@ static void __cpuinit smp_callin(void)  	 */  	pr_debug("CALLIN, before setup_local_APIC().\n"); -	smp_callin_clear_local_apic(); +	if (apic->smp_callin_clear_local_apic) +		apic->smp_callin_clear_local_apic();  	setup_local_APIC();  	end_local_APIC_setup();  	map_cpu_to_logical_apicid(); @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)  	cpumask_set_cpu(cpuid, cpu_callin_mask);  } -static int __cpuinitdata unsafe_smp; -  /*   * Activate a secondary processor.   */ @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)  	cpu_idle();  } -static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) -{ -	/* -	 * Mask B, Pentium, but not Pentium MMX -	 */ -	if (c->x86_vendor == X86_VENDOR_INTEL && -	    c->x86 == 5 && -	    c->x86_mask >= 1 && c->x86_mask <= 4 && -	    c->x86_model <= 3) -		/* -		 * Remember we have B step Pentia with bugs -		 */ -		smp_b_stepping = 1; - -	/* -	 * Certain Athlons might work (for various values of 'work') in SMP -	 * but they are not certified as MP capable. -	 */ -	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - -		if (num_possible_cpus() == 1) -			goto valid_k7; - -		/* Athlon 660/661 is valid. */ -		if ((c->x86_model == 6) && ((c->x86_mask == 0) || -		    (c->x86_mask == 1))) -			goto valid_k7; - -		/* Duron 670 is valid */ -		if ((c->x86_model == 7) && (c->x86_mask == 0)) -			goto valid_k7; - -		/* -		 * Athlon 662, Duron 671, and Athlon >model 7 have capability -		 * bit. It's worth noting that the A5 stepping (662) of some -		 * Athlon XP's have the MP bit set. -		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for -		 * more. -		 */ -		if (((c->x86_model == 6) && (c->x86_mask >= 2)) || -		    ((c->x86_model == 7) && (c->x86_mask >= 1)) || -		     (c->x86_model > 7)) -			if (cpu_has_mp) -				goto valid_k7; - -		/* If we get here, not a certified SMP capable AMD system. 
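From here on smpboot.c calls the APIC driver through the single apic structure instead of the old mach_apic.h/mach_wakecpu.h wrappers, and several hooks (wait_for_init_deassert, smp_callin_clear_local_apic, inquire_remote_apic, wakeup_secondary_cpu) become optional and are NULL-checked at the call site. A schematic sketch of that dispatch idiom; all names below are invented and nothing beyond what the hunks show is assumed:

	/* Sketch of the "optional op in a driver struct" pattern used by struct apic. */
	struct drv_ops {
		void (*optional_hook)(int arg);				/* may be NULL */
		int  (*wakeup)(int id, unsigned long start_ip);		/* may be NULL */
	};

	static int default_wakeup(int id, unsigned long start_ip)
	{
		return 0;					/* generic fallback path */
	}

	static int kick_cpu(const struct drv_ops *ops, int id, unsigned long start_ip)
	{
		if (ops->optional_hook)				/* silently skipped when absent */
			ops->optional_hook(id);

		/* prefer the driver-specific method, else the generic one */
		return ops->wakeup ? ops->wakeup(id, start_ip)
				   : default_wakeup(id, start_ip);
	}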
*/ -		unsafe_smp = 1; -	} - -valid_k7: -	; -} - -static void __cpuinit smp_checks(void) -{ -	if (smp_b_stepping) -		printk(KERN_WARNING "WARNING: SMP operation may be unreliable" -				    "with B stepping processors.\n"); - -	/* -	 * Don't taint if we are running SMP kernel on a single non-MP -	 * approved Athlon -	 */ -	if (unsafe_smp && num_online_cpus() > 1) { -		printk(KERN_INFO "WARNING: This combination of AMD" -			"processors is not suitable for SMP.\n"); -		add_taint(TAINT_UNSAFE_SMP); -	} -} -  /*   * The bootstrap kernel entry code has set these up. Save them for   * a given CPU @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)  	c->cpu_index = id;  	if (id != 0)  		identify_secondary_cpu(c); -	smp_apply_quirks(c);  } @@ -583,7 +506,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)  	/* Target chip */  	/* Boot on the stack */  	/* Kick the second */ -	apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); +	apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);  	pr_debug("Waiting for send to finish...\n");  	send_status = safe_apic_wait_icr_idle(); @@ -614,12 +537,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  	unsigned long send_status, accept_status = 0;  	int maxlvt, num_starts, j; -	if (get_uv_system_type() == UV_NON_UNIQUE_APIC) { -		send_status = uv_wakeup_secondary(phys_apicid, start_eip); -		atomic_set(&init_deasserted, 1); -		return send_status; -	} -  	maxlvt = lapic_get_maxlvt();  	/* @@ -745,78 +662,23 @@ static void __cpuinit do_fork_idle(struct work_struct *work)  	complete(&c_idle->done);  } -#ifdef CONFIG_X86_64 - -/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */ -static void __ref free_bootmem_pda(struct x8664_pda *oldpda) -{ -	if (!after_bootmem) -		free_bootmem((unsigned long)oldpda, sizeof(*oldpda)); -} - -/* - * Allocate node local memory for the AP pda. - * - * Must be called after the _cpu_pda pointer table is initialized. - */ -int __cpuinit get_local_pda(int cpu) -{ -	struct x8664_pda *oldpda, *newpda; -	unsigned long size = sizeof(struct x8664_pda); -	int node = cpu_to_node(cpu); - -	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) -		return 0; - -	oldpda = cpu_pda(cpu); -	newpda = kmalloc_node(size, GFP_ATOMIC, node); -	if (!newpda) { -		printk(KERN_ERR "Could not allocate node local PDA " -			"for CPU %d on node %d\n", cpu, node); - -		if (oldpda) -			return 0;	/* have a usable pda */ -		else -			return -1; -	} - -	if (oldpda) { -		memcpy(newpda, oldpda, size); -		free_bootmem_pda(oldpda); -	} - -	newpda->in_bootmem = 0; -	cpu_pda(cpu) = newpda; -	return 0; -} -#endif /* CONFIG_X86_64 */ - -static int __cpuinit do_boot_cpu(int apicid, int cpu)  /*   * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad   * (ie clustered apic addressing mode), this is a LOGICAL apic ID. - * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. + * Returns zero if CPU booted OK, else error code from + * ->wakeup_secondary_cpu.   
*/ +static int __cpuinit do_boot_cpu(int apicid, int cpu)  {  	unsigned long boot_error = 0; -	int timeout;  	unsigned long start_ip; -	unsigned short nmi_high = 0, nmi_low = 0; +	int timeout;  	struct create_idle c_idle = { -		.cpu = cpu, -		.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), +		.cpu	= cpu, +		.done	= COMPLETION_INITIALIZER_ONSTACK(c_idle.done),  	}; -	INIT_WORK(&c_idle.work, do_fork_idle); -#ifdef CONFIG_X86_64 -	/* Allocate node local memory for AP pdas */ -	if (cpu > 0) { -		boot_error = get_local_pda(cpu); -		if (boot_error) -			goto restore_state; -			/* if can't get pda memory, can't start cpu */ -	} -#endif +	INIT_WORK(&c_idle.work, do_fork_idle);  	alternatives_smp_switch(1); @@ -847,14 +709,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)  	set_idle_for_cpu(cpu, c_idle.idle);  do_rest: -#ifdef CONFIG_X86_32  	per_cpu(current_task, cpu) = c_idle.idle; -	init_gdt(cpu); +#ifdef CONFIG_X86_32  	/* Stack for startup_32 can be just as for start_secondary onwards */  	irq_ctx_init(cpu);  #else -	cpu_pda(cpu)->pcurrent = c_idle.idle;  	clear_tsk_thread_flag(c_idle.idle, TIF_FORK); +	initial_gs = per_cpu_offset(cpu); +	per_cpu(kernel_stack, cpu) = +		(unsigned long)task_stack_page(c_idle.idle) - +		KERNEL_STACK_OFFSET + THREAD_SIZE;  #endif  	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);  	initial_code = (unsigned long)start_secondary; @@ -878,8 +742,6 @@ do_rest:  		pr_debug("Setting warm reset code and vector.\n"); -		store_NMI_vector(&nmi_high, &nmi_low); -  		smpboot_setup_warm_reset_vector(start_ip);  		/*  		 * Be paranoid about clearing APIC errors. @@ -891,9 +753,13 @@ do_rest:  	}  	/* -	 * Starting actual IPI sequence... +	 * Kick the secondary CPU. Use the method in the APIC driver +	 * if it's defined - or use an INIT boot APIC message otherwise:  	 */ -	boot_error = wakeup_secondary_cpu(apicid, start_ip); +	if (apic->wakeup_secondary_cpu) +		boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); +	else +		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);  	if (!boot_error) {  		/* @@ -927,13 +793,11 @@ do_rest:  			else  				/* trampoline code not run */  				printk(KERN_ERR "Not responding.\n"); -			if (get_uv_system_type() != UV_NON_UNIQUE_APIC) -				inquire_remote_apic(apicid); +			if (apic->inquire_remote_apic) +				apic->inquire_remote_apic(apicid);  		}  	} -#ifdef CONFIG_X86_64 -restore_state: -#endif +  	if (boot_error) {  		/* Try to put things back the way they were before ... */  		numa_remove_cpu(cpu); /* was set by numa_add_cpu */ @@ -961,7 +825,7 @@ restore_state:  int __cpuinit native_cpu_up(unsigned int cpu)  { -	int apicid = cpu_present_to_apicid(cpu); +	int apicid = apic->cpu_present_to_apicid(cpu);  	unsigned long flags;  	int err; @@ -1054,14 +918,14 @@ static int __init smp_sanity_check(unsigned max_cpus)  {  	preempt_disable(); -#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) +#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)  	if (def_to_bigsmp && nr_cpu_ids > 8) {  		unsigned int cpu;  		unsigned nr;  		printk(KERN_WARNING  		       "More than 8 CPUs detected - skipping them.\n" -		       "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); +		       "Use CONFIG_X86_BIGSMP.\n");  		nr = 0;  		for_each_present_cpu(cpu) { @@ -1107,7 +971,7 @@ static int __init smp_sanity_check(unsigned max_cpus)  	 * Should not be necessary because the MP table should list the boot  	 * CPU too, but we do it for the sake of robustness anyway.  	 
*/ -	if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { +	if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {  		printk(KERN_NOTICE  			"weird, boot CPU (#%d) not listed by the BIOS.\n",  			boot_cpu_physical_apicid); @@ -1125,6 +989,7 @@ static int __init smp_sanity_check(unsigned max_cpus)  		printk(KERN_ERR "... forcing use of dummy APIC emulation."  				"(tell your hw vendor)\n");  		smpboot_clear_io_apic(); +		arch_disable_smp_support();  		return -1;  	} @@ -1181,9 +1046,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	current_thread_info()->cpu = 0;  /* needed? */  	set_cpu_sibling_map(0); -#ifdef CONFIG_X86_64  	enable_IR_x2apic(); -	setup_apic_routing(); +#ifdef CONFIG_X86_64 +	default_setup_apic_routing();  #endif  	if (smp_sanity_check(max_cpus) < 0) { @@ -1207,18 +1072,18 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	 */  	setup_local_APIC(); -#ifdef CONFIG_X86_64  	/*  	 * Enable IO APIC before setting up error vector  	 */  	if (!skip_ioapic_setup && nr_ioapics)  		enable_IO_APIC(); -#endif +  	end_local_APIC_setup();  	map_cpu_to_logical_apicid(); -	setup_portio_remap(); +	if (apic->setup_portio_remap) +		apic->setup_portio_remap();  	smpboot_setup_io_apic();  	/* @@ -1240,10 +1105,7 @@ out:  void __init native_smp_prepare_boot_cpu(void)  {  	int me = smp_processor_id(); -#ifdef CONFIG_X86_32 -	init_gdt(me); -#endif -	switch_to_new_gdt(); +	switch_to_new_gdt(me);  	/* already set me in cpu_online_mask in boot_cpu_init() */  	cpumask_set_cpu(me, cpu_callout_mask);  	per_cpu(cpu_state, me) = CPU_ONLINE; @@ -1254,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)  	pr_debug("Boot done.\n");  	impress_friends(); -	smp_checks();  #ifdef CONFIG_X86_IO_APIC  	setup_ioapic_dest();  #endif diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c deleted file mode 100644 index 397e309839d..00000000000 --- a/arch/x86/kernel/smpcommon.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * SMP stuff which is common to all sub-architectures. - */ -#include <linux/module.h> -#include <asm/smp.h> - -#ifdef CONFIG_X86_32 -DEFINE_PER_CPU(unsigned long, this_cpu_off); -EXPORT_PER_CPU_SYMBOL(this_cpu_off); - -/* - * Initialize the CPU's GDT.  This is either the boot CPU doing itself - * (still using the master per-cpu area), or a CPU doing it for a - * secondary which will soon come up. - */ -__cpuinit void init_gdt(int cpu) -{ -	struct desc_struct gdt; - -	pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF, -			0x2 | DESCTYPE_S, 0x8); -	gdt.s = 1; - -	write_gdt_entry(get_cpu_gdt_table(cpu), -			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); - -	per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; -	per_cpu(cpu_number, cpu) = cpu; -} -#endif diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 10786af9554..f7bddc2e37d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -1,7 +1,7 @@  /*   * Stack trace management functions   * - *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>   */  #include <linux/sched.h>  #include <linux/stacktrace.h> diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c deleted file mode 100644 index 7b987852e87..00000000000 --- a/arch/x86/kernel/summit_32.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * IBM Summit-Specific Code - * - * Written By: Matthew Dobson, IBM Corporation - * - * Copyright (c) 2003 IBM Corp. - * - * All rights reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT.  See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <colpatch@us.ibm.com> - * - */ - -#include <linux/mm.h> -#include <linux/init.h> -#include <asm/io.h> -#include <asm/bios_ebda.h> -#include <asm/summit/mpparse.h> - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata; - -#ifndef CONFIG_X86_NUMAQ -static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; -#endif - -static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) -{ -	int twister = 0, node = 0; -	int i, bus, num_buses; - -	for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { -		if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { -			twister = rio_devs[i]->owner_id; -			break; -		} -	} -	if (i == rio_table_hdr->num_rio_dev) { -		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); -		return last_bus; -	} - -	for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { -		if (scal_devs[i]->node_id == twister) { -			node = scal_devs[i]->node_id; -			break; -		} -	} -	if (i == rio_table_hdr->num_scal_dev) { -		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); -		return last_bus; -	} - -	switch (rio_devs[wpeg_num]->type) { -	case CompatWPEG: -		/* -		 * The Compatibility Winnipeg controls the 2 legacy buses, -		 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case -		 * a PCI-PCI bridge card is used in either slot: total 5 buses. -		 */ -		num_buses = 5; -		break; -	case AltWPEG: -		/* -		 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot -		 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and -		 * the "extra" buses for each of those slots: total 7 buses. -		 */ -		num_buses = 7; -		break; -	case LookOutAWPEG: -	case LookOutBWPEG: -		/* -		 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] -		 * & the "extra" buses for each of those slots: total 9 buses. -		 */ -		num_buses = 9; -		break; -	default: -		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); -		return last_bus; -	} - -	for (bus = last_bus; bus < last_bus + num_buses; bus++) -		mp_bus_id_to_node[bus] = node; -	return bus; -} - -static int __init build_detail_arrays(void) -{ -	unsigned long ptr; -	int i, scal_detail_size, rio_detail_size; - -	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { -		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  
Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); -		return 0; -	} - -	switch (rio_table_hdr->version) { -	default: -		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); -		return 0; -	case 2: -		scal_detail_size = 11; -		rio_detail_size = 13; -		break; -	case 3: -		scal_detail_size = 12; -		rio_detail_size = 15; -		break; -	} - -	ptr = (unsigned long)rio_table_hdr + 3; -	for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) -		scal_devs[i] = (struct scal_detail *)ptr; - -	for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) -		rio_devs[i] = (struct rio_detail *)ptr; - -	return 1; -} - -void __init setup_summit(void) -{ -	unsigned long		ptr; -	unsigned short		offset; -	int			i, next_wpeg, next_bus = 0; - -	/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ -	ptr = get_bios_ebda(); -	ptr = (unsigned long)phys_to_virt(ptr); - -	rio_table_hdr = NULL; -	offset = 0x180; -	while (offset) { -		/* The block id is stored in the 2nd word */ -		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { -			/* set the pointer past the offset & block id */ -			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); -			break; -		} -		/* The next offset is stored in the 1st word.  0 means no more */ -		offset = *((unsigned short *)(ptr + offset)); -	} -	if (!rio_table_hdr) { -		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); -		return; -	} - -	if (!build_detail_arrays()) -		return; - -	/* The first Winnipeg we're looking for has an index of 0 */ -	next_wpeg = 0; -	do { -		for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { -			if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { -				/* It's the Winnipeg we're looking for! */ -				next_bus = setup_pci_node_map_for_wpeg(i, next_bus); -				next_wpeg++; -				break; -			} -		} -		/* -		 * If we go through all Rio devices and don't find one with -		 * the next index, it means we've found all the Winnipegs, -		 * and thus all the PCI buses. 
-		 */ -		if (i == rio_table_hdr->num_rio_dev) -			next_wpeg = 0; -	} while (next_wpeg != 0); -} diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e2e86a08f31..3bdb64829b8 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -1,7 +1,7 @@  ENTRY(sys_call_table)  	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */  	.long sys_exit -	.long sys_fork +	.long ptregs_fork  	.long sys_read  	.long sys_write  	.long sys_open		/* 5 */ @@ -10,7 +10,7 @@ ENTRY(sys_call_table)  	.long sys_creat  	.long sys_link  	.long sys_unlink	/* 10 */ -	.long sys_execve +	.long ptregs_execve  	.long sys_chdir  	.long sys_time  	.long sys_mknod @@ -109,17 +109,17 @@ ENTRY(sys_call_table)  	.long sys_newlstat  	.long sys_newfstat  	.long sys_uname -	.long sys_iopl		/* 110 */ +	.long ptregs_iopl	/* 110 */  	.long sys_vhangup  	.long sys_ni_syscall	/* old "idle" system call */ -	.long sys_vm86old +	.long ptregs_vm86old  	.long sys_wait4  	.long sys_swapoff	/* 115 */  	.long sys_sysinfo  	.long sys_ipc  	.long sys_fsync -	.long sys_sigreturn -	.long sys_clone		/* 120 */ +	.long ptregs_sigreturn +	.long ptregs_clone	/* 120 */  	.long sys_setdomainname  	.long sys_newuname  	.long sys_modify_ldt @@ -165,14 +165,14 @@ ENTRY(sys_call_table)  	.long sys_mremap  	.long sys_setresuid16  	.long sys_getresuid16	/* 165 */ -	.long sys_vm86 +	.long ptregs_vm86  	.long sys_ni_syscall	/* Old sys_query_module */  	.long sys_poll  	.long sys_nfsservctl  	.long sys_setresgid16	/* 170 */  	.long sys_getresgid16  	.long sys_prctl -	.long sys_rt_sigreturn +	.long ptregs_rt_sigreturn  	.long sys_rt_sigaction  	.long sys_rt_sigprocmask	/* 175 */  	.long sys_rt_sigpending @@ -185,11 +185,11 @@ ENTRY(sys_call_table)  	.long sys_getcwd  	.long sys_capget  	.long sys_capset	/* 185 */ -	.long sys_sigaltstack +	.long ptregs_sigaltstack  	.long sys_sendfile  	.long sys_ni_syscall	/* reserved for streams1 */  	.long sys_ni_syscall	/* reserved for streams2 */ -	.long sys_vfork		/* 190 */ +	.long ptregs_vfork	/* 190 */  	.long sys_getrlimit  	.long sys_mmap2  	.long sys_truncate64 diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 3985cac0ed4..5c5d87f0b2e 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -33,12 +33,12 @@  #include <linux/time.h>  #include <linux/mca.h> -#include <asm/arch_hooks.h> +#include <asm/setup.h>  #include <asm/hpet.h>  #include <asm/time.h>  #include <asm/timer.h> -#include "do_timer.h" +#include <asm/do_timer.h>  int timer_ack; @@ -118,7 +118,7 @@ void __init hpet_time_init(void)  {  	if (!hpet_enable())  		setup_pit_timer(); -	time_init_hook(); +	x86_quirk_time_init();  }  /* @@ -131,7 +131,7 @@ void __init hpet_time_init(void)   */  void __init time_init(void)  { -	pre_time_init_hook(); +	x86_quirk_pre_time_init();  	tsc_init();  	late_time_init = choose_time_init();  } diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c deleted file mode 100644 index ce505464224..00000000000 --- a/arch/x86/kernel/tlb_32.c +++ /dev/null @@ -1,256 +0,0 @@ -#include <linux/spinlock.h> -#include <linux/cpu.h> -#include <linux/interrupt.h> - -#include <asm/tlbflush.h> - -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) -			____cacheline_aligned = { &init_mm, 0, }; - -/* must come after the send_IPI functions above for inlining */ -#include <mach_ipi.h> - -/* - *	Smarter SMP flushing macros. - *		c/o Linus Torvalds. 
- * - *	These mean you can really definitely utterly forget about - *	writing to user space from interrupts. (Its not allowed anyway). - * - *	Optimizations Manfred Spraul <manfred@colorfullife.com> - */ - -static cpumask_t flush_cpumask; -static struct mm_struct *flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - * - * We need to reload %cr3 since the page tables may be going - * away from under us.. - */ -void leave_mm(int cpu) -{ -	BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); -	cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); -	load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * 	Stop ipi delivery for the old mm. This is not synchronized with - * 	the other cpus, but smp_invalidate_interrupt ignore flush ipis - * 	for the wrong mm, and in the worst case we perform a superfluous - * 	tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK - * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - *	was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm - * 	Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * 	Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - *	cpu_tlbstate[].active_mm is correct, cpu0 already handles - *	flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * 	Atomically set the bit [other cpus will start sending flush ipis], - * 	and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - *   runs in kernel space, the cpu could load tlb entries for user space - *   pages. - * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. 
- */ - -void smp_invalidate_interrupt(struct pt_regs *regs) -{ -	unsigned long cpu; - -	cpu = get_cpu(); - -	if (!cpu_isset(cpu, flush_cpumask)) -		goto out; -		/* -		 * This was a BUG() but until someone can quote me the -		 * line from the intel manual that guarantees an IPI to -		 * multiple CPUs is retried _only_ on the erroring CPUs -		 * its staying as a return -		 * -		 * BUG(); -		 */ - -	if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { -		if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { -			if (flush_va == TLB_FLUSH_ALL) -				local_flush_tlb(); -			else -				__flush_tlb_one(flush_va); -		} else -			leave_mm(cpu); -	} -	ack_APIC_irq(); -	smp_mb__before_clear_bit(); -	cpu_clear(cpu, flush_cpumask); -	smp_mb__after_clear_bit(); -out: -	put_cpu_no_resched(); -	inc_irq_stat(irq_tlb_count); -} - -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, -			     unsigned long va) -{ -	cpumask_t cpumask = *cpumaskp; - -	/* -	 * A couple of (to be removed) sanity checks: -	 * -	 * - current CPU must not be in mask -	 * - mask must exist :) -	 */ -	BUG_ON(cpus_empty(cpumask)); -	BUG_ON(cpu_isset(smp_processor_id(), cpumask)); -	BUG_ON(!mm); - -#ifdef CONFIG_HOTPLUG_CPU -	/* If a CPU which we ran on has gone down, OK. */ -	cpus_and(cpumask, cpumask, cpu_online_map); -	if (unlikely(cpus_empty(cpumask))) -		return; -#endif - -	/* -	 * i'm not happy about this global shared spinlock in the -	 * MM hot path, but we'll see how contended it is. -	 * AK: x86-64 has a faster method that could be ported. -	 */ -	spin_lock(&tlbstate_lock); - -	flush_mm = mm; -	flush_va = va; -	cpus_or(flush_cpumask, cpumask, flush_cpumask); - -	/* -	 * Make the above memory operations globally visible before -	 * sending the IPI. -	 */ -	smp_mb(); -	/* -	 * We have to send the IPI only to -	 * CPUs affected. -	 */ -	send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); - -	while (!cpus_empty(flush_cpumask)) -		/* nothing. 
lockup detection does not belong here */ -		cpu_relax(); - -	flush_mm = NULL; -	flush_va = 0; -	spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ -	struct mm_struct *mm = current->mm; -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	local_flush_tlb(); -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); -	preempt_enable(); -} - -void flush_tlb_mm(struct mm_struct *mm) -{ -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	if (current->active_mm == mm) { -		if (current->mm) -			local_flush_tlb(); -		else -			leave_mm(smp_processor_id()); -	} -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - -	preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) -{ -	struct mm_struct *mm = vma->vm_mm; -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	if (current->active_mm == mm) { -		if (current->mm) -			__flush_tlb_one(va); -		 else -			leave_mm(smp_processor_id()); -	} - -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, va); - -	preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_page); - -static void do_flush_tlb_all(void *info) -{ -	unsigned long cpu = smp_processor_id(); - -	__flush_tlb_all(); -	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) -		leave_mm(cpu); -} - -void flush_tlb_all(void) -{ -	on_each_cpu(do_flush_tlb_all, NULL, 1); -} - -void reset_lazy_tlbstate(void) -{ -	int cpu = raw_smp_processor_id(); - -	per_cpu(cpu_tlbstate, cpu).state = 0; -	per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} - diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c deleted file mode 100644 index f8be6f1d2e4..00000000000 --- a/arch/x86/kernel/tlb_64.c +++ /dev/null @@ -1,284 +0,0 @@ -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/interrupt.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/apicdef.h> -#include <asm/idle.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_bau.h> - -#include <mach_ipi.h> -/* - *	Smarter SMP flushing macros. - *		c/o Linus Torvalds. - * - *	These mean you can really definitely utterly forget about - *	writing to user space from interrupts. (Its not allowed anyway). - * - *	Optimizations Manfred Spraul <manfred@colorfullife.com> - * - *	More scalable flush, from Andi Kleen - * - *	To avoid global state use 8 different call vectors. - *	Each CPU uses a specific vector to trigger flushes on other - *	CPUs. Depending on the received vector the target CPUs look into - *	the right per cpu variable for the flush data. - * - *	With more than 8 CPUs they are hashed to the 8 available - *	vectors. The limited global vector space forces us to this right now. - *	In future when interrupts are split into per CPU domains this could be - *	fixed, at the cost of triggering multiple IPIs in some cases. 
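The 64-bit flush code being deleted here (consolidated elsewhere in this series) avoids a single global lock by spreading senders over eight INVALIDATE_TLB vectors: a sender hashes its CPU number onto a slot, sends the vector for that slot, and the receiver recovers the slot from the vector recorded in orig_ax. A schematic restatement of that mapping; the 0xf0 base is an assumption, only the hashing and the complement come from the deleted code:

	enum { TLB_VECTOR_START = 0xf0, NUM_TLB_VECTORS = 8 };	/* base value assumed */

	static int sender_slot(int cpu)
	{
		return cpu % NUM_TLB_VECTORS;		/* many CPUs hash onto 8 slots */
	}

	static int vector_for_slot(int slot)
	{
		return TLB_VECTOR_START + slot;		/* IPI vector actually sent */
	}

	static int slot_from_orig_ax(long orig_ax)
	{
		/* the entry code saves the negated vector, hence the complement */
		return (int)~orig_ax - TLB_VECTOR_START;
	}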
- */ - -union smp_flush_state { -	struct { -		cpumask_t flush_cpumask; -		struct mm_struct *flush_mm; -		unsigned long flush_va; -		spinlock_t tlbstate_lock; -	}; -	char pad[SMP_CACHE_BYTES]; -} ____cacheline_aligned; - -/* State is put into the per CPU data section, but padded -   to a full cache line because other CPUs can access it and we don't -   want false sharing in the per cpu data segment. */ -static DEFINE_PER_CPU(union smp_flush_state, flush_state); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -void leave_mm(int cpu) -{ -	if (read_pda(mmu_state) == TLBSTATE_OK) -		BUG(); -	cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); -	load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - *	Stop ipi delivery for the old mm. This is not synchronized with - *	the other cpus, but smp_invalidate_interrupt ignore flush ipis - *	for the wrong mm, and in the worst case we perform a superfluous - *	tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - *	was in lazy tlb mode. - * 1a3) update cpu active_mm - *	Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - *	Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - *	cpu active_mm is correct, cpu0 already handles - *	flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - *	Atomically set the bit [other cpus will start sending flush ipis], - *	and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - *   runs in kernel space, the cpu could load tlb entries for user space - *   pages. - * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) -{ -	int cpu; -	int sender; -	union smp_flush_state *f; - -	cpu = smp_processor_id(); -	/* -	 * orig_rax contains the negated interrupt vector. -	 * Use that to determine where the sender put the data. 
-	 */ -	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; -	f = &per_cpu(flush_state, sender); - -	if (!cpu_isset(cpu, f->flush_cpumask)) -		goto out; -		/* -		 * This was a BUG() but until someone can quote me the -		 * line from the intel manual that guarantees an IPI to -		 * multiple CPUs is retried _only_ on the erroring CPUs -		 * its staying as a return -		 * -		 * BUG(); -		 */ - -	if (f->flush_mm == read_pda(active_mm)) { -		if (read_pda(mmu_state) == TLBSTATE_OK) { -			if (f->flush_va == TLB_FLUSH_ALL) -				local_flush_tlb(); -			else -				__flush_tlb_one(f->flush_va); -		} else -			leave_mm(cpu); -	} -out: -	ack_APIC_irq(); -	cpu_clear(cpu, f->flush_cpumask); -	inc_irq_stat(irq_tlb_count); -} - -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, -			     unsigned long va) -{ -	int sender; -	union smp_flush_state *f; -	cpumask_t cpumask = *cpumaskp; - -	if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) -		return; - -	/* Caller has disabled preemption */ -	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; -	f = &per_cpu(flush_state, sender); - -	/* -	 * Could avoid this lock when -	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is -	 * probably not worth checking this for a cache-hot lock. -	 */ -	spin_lock(&f->tlbstate_lock); - -	f->flush_mm = mm; -	f->flush_va = va; -	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); - -	/* -	 * Make the above memory operations globally visible before -	 * sending the IPI. -	 */ -	smp_mb(); -	/* -	 * We have to send the IPI only to -	 * CPUs affected. -	 */ -	send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); - -	while (!cpus_empty(f->flush_cpumask)) -		cpu_relax(); - -	f->flush_mm = NULL; -	f->flush_va = 0; -	spin_unlock(&f->tlbstate_lock); -} - -static int __cpuinit init_smp_flush(void) -{ -	int i; - -	for_each_possible_cpu(i) -		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); - -	return 0; -} -core_initcall(init_smp_flush); - -void flush_tlb_current_task(void) -{ -	struct mm_struct *mm = current->mm; -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	local_flush_tlb(); -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); -	preempt_enable(); -} - -void flush_tlb_mm(struct mm_struct *mm) -{ -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	if (current->active_mm == mm) { -		if (current->mm) -			local_flush_tlb(); -		else -			leave_mm(smp_processor_id()); -	} -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - -	preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) -{ -	struct mm_struct *mm = vma->vm_mm; -	cpumask_t cpu_mask; - -	preempt_disable(); -	cpu_mask = mm->cpu_vm_mask; -	cpu_clear(smp_processor_id(), cpu_mask); - -	if (current->active_mm == mm) { -		if (current->mm) -			__flush_tlb_one(va); -		else -			leave_mm(smp_processor_id()); -	} - -	if (!cpus_empty(cpu_mask)) -		flush_tlb_others(cpu_mask, mm, va); - -	preempt_enable(); -} - -static void do_flush_tlb_all(void *info) -{ -	unsigned long cpu = smp_processor_id(); - -	__flush_tlb_all(); -	if (read_pda(mmu_state) == TLBSTATE_LAZY) -		leave_mm(cpu); -} - -void flush_tlb_all(void) -{ -	on_each_cpu(do_flush_tlb_all, NULL, 1); -} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 6812b829ed8..d038b9c45cf 100644 --- a/arch/x86/kernel/tlb_uv.c +++ 
b/arch/x86/kernel/tlb_uv.c @@ -11,16 +11,15 @@  #include <linux/kernel.h>  #include <asm/mmu_context.h> +#include <asm/uv/uv.h>  #include <asm/uv/uv_mmrs.h>  #include <asm/uv/uv_hub.h>  #include <asm/uv/uv_bau.h> -#include <asm/genapic.h> +#include <asm/apic.h>  #include <asm/idle.h>  #include <asm/tsc.h>  #include <asm/irq_vectors.h> -#include <mach_apic.h> -  static struct bau_control	**uv_bau_table_bases __read_mostly;  static int			uv_bau_retry_limit __read_mostly; @@ -210,14 +209,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,   *   * Send a broadcast and wait for a broadcast message to complete.   * - * The cpumaskp mask contains the cpus the broadcast was sent to. + * The flush_mask contains the cpus the broadcast was sent to.   * - * Returns 1 if all remote flushing was done. The mask is zeroed. - * Returns 0 if some remote flushing remains to be done. The mask is left - * unchanged. + * Returns NULL if all remote flushing was done. The mask is zeroed. + * Returns @flush_mask if some remote flushing remains to be done. The + * mask will have some bits still set.   */ -int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, -			   cpumask_t *cpumaskp) +const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, +					     struct bau_desc *bau_desc, +					     struct cpumask *flush_mask)  {  	int completion_status = 0;  	int right_shift; @@ -257,66 +257,74 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,  		 * the cpu's, all of which are still in the mask.  		 */  		__get_cpu_var(ptcstats).ptc_i++; -		return 0; +		return flush_mask;  	}  	/*  	 * Success, so clear the remote cpu's from the mask so we don't  	 * use the IPI method of shootdown on them.  	 */ -	for_each_cpu_mask(bit, *cpumaskp) { +	for_each_cpu(bit, flush_mask) {  		blade = uv_cpu_to_blade_id(bit);  		if (blade == this_blade)  			continue; -		cpu_clear(bit, *cpumaskp); +		cpumask_clear_cpu(bit, flush_mask);  	} -	if (!cpus_empty(*cpumaskp)) -		return 0; -	return 1; +	if (!cpumask_empty(flush_mask)) +		return flush_mask; +	return NULL;  }  /**   * uv_flush_tlb_others - globally purge translation cache of a virtual   * address or all TLB's - * @cpumaskp: mask of all cpu's in which the address is to be removed + * @cpumask: mask of all cpu's in which the address is to be removed   * @mm: mm_struct containing virtual address range   * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @cpu: the current cpu   *   * This is the entry point for initiating any UV global TLB shootdown.   *   * Purges the translation caches of all specified processors of the given   * virtual address, or purges all TLB's on specified processors.   * - * The caller has derived the cpumaskp from the mm_struct and has subtracted - * the local cpu from the mask.  This function is called only if there - * are bits set in the mask. (e.g. flush_tlb_page()) + * The caller has derived the cpumask from the mm_struct.  This function + * is called only if there are bits set in the mask. (e.g. flush_tlb_page())   * - * The cpumaskp is converted into a nodemask of the nodes containing + * The cpumask is converted into a nodemask of the nodes containing   * the cpus.   * - * Returns 1 if all remote flushing was done. - * Returns 0 if some remote flushing remains to be done. + * Note that this function should be called with preemption disabled. + * + * Returns NULL if all remote flushing was done. 
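The interface change above gives uv_flush_tlb_others() a new contract: it returns NULL when the BAU handled all remote CPUs, or a cpumask of the CPUs that still need a conventional IPI, valid only until preemption is re-enabled. A sketch of how a caller might consume that; flush_by_ipi() is a hypothetical stand-in for the generic IPI path, not something this patch defines:

	static void flush_others(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long va)
	{
		const struct cpumask *remaining;
		unsigned int cpu = get_cpu();	/* preemption stays off while the mask is used */

		remaining = uv_flush_tlb_others(cpumask, mm, va, cpu);
		if (remaining)
			flush_by_ipi(remaining, mm, va);	/* hypothetical fallback */

		put_cpu();
	}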
+ * Returns pointer to cpumask if some remote flushing remains to be + * done.  The returned pointer is valid till preemption is re-enabled.   */ -int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, -			unsigned long va) +const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, +					  struct mm_struct *mm, +					  unsigned long va, unsigned int cpu)  { +	static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); +	struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);  	int i;  	int bit;  	int blade; -	int cpu; +	int uv_cpu;  	int this_blade;  	int locals = 0;  	struct bau_desc *bau_desc; -	cpu = uv_blade_processor_id(); +	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + +	uv_cpu = uv_blade_processor_id();  	this_blade = uv_numa_blade_id();  	bau_desc = __get_cpu_var(bau_control).descriptor_base; -	bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; +	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;  	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);  	i = 0; -	for_each_cpu_mask(bit, *cpumaskp) { +	for_each_cpu(bit, flush_mask) {  		blade = uv_cpu_to_blade_id(bit);  		BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));  		if (blade == this_blade) { @@ -331,17 +339,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,  		 * no off_node flushing; return status for local node  		 */  		if (locals) -			return 0; +			return flush_mask;  		else -			return 1; +			return NULL;  	}  	__get_cpu_var(ptcstats).requestor++;  	__get_cpu_var(ptcstats).ntargeted += i;  	bau_desc->payload.address = va; -	bau_desc->payload.sending_cpu = smp_processor_id(); +	bau_desc->payload.sending_cpu = cpu; -	return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); +	return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);  }  /* diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index d8ccc3c6552..66d874e5404 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -29,7 +29,7 @@  #include <linux/linkage.h>  #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h>  /* We can free up trampoline after bootup if cpu hotplug is not supported. */  #ifndef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 894293c598d..cddfb8d386b 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,10 +25,11 @@   */  #include <linux/linkage.h> -#include <asm/pgtable.h> -#include <asm/page.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h>  #include <asm/msr.h>  #include <asm/segment.h> +#include <asm/processor-flags.h>  .section .rodata, "a", @progbits @@ -37,7 +38,7 @@  ENTRY(trampoline_data)  r_base = .  	cli			# We should be safe anyway -	wbinvd	 +	wbinvd  	mov	%cs, %ax	# Code and data in the same place  	mov	%ax, %ds  	mov	%ax, %es @@ -73,9 +74,8 @@ r_base = .  	
lidtl	tidt - r_base	# load idt with 0, 0  	lgdtl	tgdt - r_base	# load gdt with whatever is appropriate -	xor	%ax, %ax -	inc	%ax		# protected mode (PE) bit -	lmsw	%ax		# into protected mode +	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit +	lmsw	%ax			# into protected mode  	# flush prefetch and jump to startup_32  	ljmpl	*(startup_32_vector - r_base) @@ -86,9 +86,8 @@ startup_32:  	movl	$__KERNEL_DS, %eax	# Initialize the %ds segment register  	movl	%eax, %ds -	xorl	%eax, %eax -	btsl	$5, %eax		# Enable PAE mode -	movl	%eax, %cr4 +	movl	$X86_CR4_PAE, %eax +	movl	%eax, %cr4		# Enable PAE mode  					# Setup trampoline 4 level pagetables  	leal	(trampoline_level4_pgt - r_base)(%esi), %eax @@ -99,9 +98,9 @@ startup_32:  	xorl	%edx, %edx  	wrmsr -	xorl	%eax, %eax -	btsl	$31, %eax		# Enable paging and in turn activate Long Mode -	btsl	$0, %eax		# Enable protected mode +	# Enable paging and in turn activate Long Mode +	# Enable protected mode +	movl	$(X86_CR0_PG | X86_CR0_PE), %eax  	movl	%eax, %cr0  	/* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a9e7548e179..a1d288327ff 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,15 +54,14 @@  #include <asm/desc.h>  #include <asm/i387.h> -#include <mach_traps.h> +#include <asm/mach_traps.h>  #ifdef CONFIG_X86_64  #include <asm/pgalloc.h>  #include <asm/proto.h> -#include <asm/pda.h>  #else  #include <asm/processor-flags.h> -#include <asm/arch_hooks.h> +#include <asm/setup.h>  #include <asm/traps.h>  #include "cpu/mcheck/mce.h" @@ -119,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)  	if (!user_mode_vm(regs))  		die(str, regs, err);  } - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ -	struct thread_struct *thread; -	struct tss_struct *tss; -	int cpu; - -	cpu = get_cpu(); -	tss = &per_cpu(init_tss, cpu); -	thread = ¤t->thread; - -	if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && -	    thread->io_bitmap_ptr) { -		memcpy(tss->io_bitmap, thread->io_bitmap_ptr, -		       thread->io_bitmap_max); -		/* -		 * If the previously set map was extending to higher ports -		 * than the current one, pad extra space with 0xff (no access). 
-		 */ -		if (thread->io_bitmap_max < tss->io_bitmap_max) { -			memset((char *) tss->io_bitmap + -				thread->io_bitmap_max, 0xff, -				tss->io_bitmap_max - thread->io_bitmap_max); -		} -		tss->io_bitmap_max = thread->io_bitmap_max; -		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; -		tss->io_bitmap_owner = thread; -		put_cpu(); - -		return 1; -	} -	put_cpu(); - -	return 0; -}  #endif  static void __kprobes @@ -310,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)  	conditional_sti(regs);  #ifdef CONFIG_X86_32 -	if (lazy_iobitmap_copy()) { -		/* restart the faulting instruction */ -		return; -	} -  	if (regs->flags & X86_VM_MASK)  		goto gp_in_vm86;  #endif @@ -914,19 +867,20 @@ void math_emulate(struct math_emu_info *info)  }  #endif /* CONFIG_MATH_EMULATION */ -dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error_code)  {  #ifdef CONFIG_X86_32  	if (read_cr0() & X86_CR0_EM) {  		struct math_emu_info info = { }; -		conditional_sti(®s); +		conditional_sti(regs); -		info.regs = ®s; +		info.regs = regs;  		math_emulate(&info);  	} else {  		math_state_restore(); /* interrupts still off */ -		conditional_sti(®s); +		conditional_sti(regs);  	}  #else  	math_state_restore(); @@ -942,7 +896,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)  	info.si_signo = SIGILL;  	info.si_errno = 0;  	info.si_code = ILL_BADSTK; -	info.si_addr = 0; +	info.si_addr = NULL;  	if (notify_die(DIE_TRAP, "iret exception",  			regs, error_code, 32, SIGILL) == NOTIFY_STOP)  		return; @@ -1026,6 +980,6 @@ void __init trap_init(void)  	cpu_init();  #ifdef CONFIG_X86_32 -	trap_init_hook(); +	x86_quirk_trap_init();  #endif  } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 599e5816863..83d53ce5d4c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -773,7 +773,7 @@ __cpuinit int unsynchronized_tsc(void)  	if (!cpu_has_tsc || tsc_unstable)  		return 1; -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP  	if (apic_is_clustered_box())  		return 1;  #endif diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 00000000000..2ffb6c53326 --- /dev/null +++ b/arch/x86/kernel/uv_time.c @@ -0,0 +1,393 @@ +/* + * SGI RTC clock/timer routines. + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License as published by + *  the Free Software Foundation; either version 2 of the License, or + *  (at your option) any later version. + * + *  This program is distributed in the hope that it will be useful, + *  but WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *  GNU General Public License for more details. + * + *  You should have received a copy of the GNU General Public License + *  along with this program; if not, write to the Free Software + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + * + *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved. 
+ *  Copyright (c) Dimitri Sivanich + */ +#include <linux/clockchips.h> + +#include <asm/uv/uv_mmrs.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/bios.h> +#include <asm/uv/uv.h> +#include <asm/apic.h> +#include <asm/cpu.h> + +#define RTC_NAME		"sgi_rtc" + +static cycle_t uv_read_rtc(void); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, +				struct clock_event_device *); + +static struct clocksource clocksource_uv = { +	.name		= RTC_NAME, +	.rating		= 400, +	.read		= uv_read_rtc, +	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, +	.shift		= 10, +	.flags		= CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { +	.name		= RTC_NAME, +	.features	= CLOCK_EVT_FEAT_ONESHOT, +	.shift		= 20, +	.rating		= 400, +	.irq		= -1, +	.set_next_event	= uv_rtc_next_event, +	.set_mode	= uv_rtc_timer_setup, +	.event_handler	= NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { +	spinlock_t	lock; +	/* next cpu waiting for timer, local node relative: */ +	int		next_cpu; +	/* number of cpus on this node: */ +	int		ncpus; +	struct { +		int	lcpu;		/* systemwide logical cpu number */ +		u64	expires;	/* next timer expiration for this cpu */ +	} cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head		**blade_info __read_mostly; + +static int				uv_rtc_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ +	unsigned long apicid, val; +	int pnode; + +	apicid = cpu_physical_id(cpu); +	pnode = uv_apicid_to_pnode(apicid); +	val = (1UL << UVH_IPI_INT_SEND_SHFT) | +	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) | +	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + +	uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ +	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & +		UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ +	u64 val; +	int pnode = uv_cpu_to_pnode(cpu); + +	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, +		UVH_RTC1_INT_CONFIG_M_MASK); +	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + +	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, +		UVH_EVENT_OCCURRED0_RTC1_MASK); + +	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | +		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + +	/* Set configuration */ +	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); +	/* Initialize comparator value */ +	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + +	return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ +	int bid; + +	for_each_possible_blade(bid) { +		kfree(blade_info[bid]); +	} +	kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. 
*/ +static __init int uv_rtc_allocate_timers(void) +{ +	int cpu; + +	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); +	if (!blade_info) +		return -ENOMEM; +	memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + +	for_each_present_cpu(cpu) { +		int nid = cpu_to_node(cpu); +		int bid = uv_cpu_to_blade_id(cpu); +		int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; +		struct uv_rtc_timer_head *head = blade_info[bid]; + +		if (!head) { +			head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + +				(uv_blade_nr_possible_cpus(bid) * +					2 * sizeof(u64)), +				GFP_KERNEL, nid); +			if (!head) { +				uv_rtc_deallocate_timers(); +				return -ENOMEM; +			} +			spin_lock_init(&head->lock); +			head->ncpus = uv_blade_nr_possible_cpus(bid); +			head->next_cpu = -1; +			blade_info[bid] = head; +		} + +		head->cpu[bcpu].lcpu = cpu; +		head->cpu[bcpu].expires = ULLONG_MAX; +	} + +	return 0; +} + +/* Find and set the next expiring timer.  */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ +	u64 lowest = ULLONG_MAX; +	int c, bcpu = -1; + +	head->next_cpu = -1; +	for (c = 0; c < head->ncpus; c++) { +		u64 exp = head->cpu[c].expires; +		if (exp < lowest) { +			bcpu = c; +			lowest = exp; +		} +	} +	if (bcpu >= 0) { +		head->next_cpu = bcpu; +		c = head->cpu[bcpu].lcpu; +		if (uv_setup_intr(c, lowest)) +			/* If we didn't set it up in time, trigger */ +			uv_rtc_send_IPI(c); +	} else { +		uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, +			UVH_RTC1_INT_CONFIG_M_MASK); +	} +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ +	int pnode = uv_cpu_to_pnode(cpu); +	int bid = uv_cpu_to_blade_id(cpu); +	struct uv_rtc_timer_head *head = blade_info[bid]; +	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; +	u64 *t = &head->cpu[bcpu].expires; +	unsigned long flags; +	int next_cpu; + +	spin_lock_irqsave(&head->lock, flags); + +	next_cpu = head->next_cpu; +	*t = expires; +	/* Will this one be next to go off? */ +	if (next_cpu < 0 || bcpu == next_cpu || +			expires < head->cpu[next_cpu].expires) { +		head->next_cpu = bcpu; +		if (uv_setup_intr(cpu, expires)) { +			*t = ULLONG_MAX; +			uv_rtc_find_next_timer(head, pnode); +			spin_unlock_irqrestore(&head->lock, flags); +			return 1; +		} +	} + +	spin_unlock_irqrestore(&head->lock, flags); +	return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu) +{ +	int pnode = uv_cpu_to_pnode(cpu); +	int bid = uv_cpu_to_blade_id(cpu); +	struct uv_rtc_timer_head *head = blade_info[bid]; +	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; +	u64 *t = &head->cpu[bcpu].expires; +	unsigned long flags; +	int rc = 0; + +	spin_lock_irqsave(&head->lock, flags); + +	if (head->next_cpu == bcpu && uv_read_rtc() >= *t) +		rc = 1; + +	*t = ULLONG_MAX; + +	/* Was the hardware setup for this timer? */ +	if (head->next_cpu == bcpu) +		uv_rtc_find_next_timer(head, pnode); + +	spin_unlock_irqrestore(&head->lock, flags); + +	return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. 
+ */ +static cycle_t uv_read_rtc(void) +{ +	return (cycle_t)uv_read_local_mmr(UVH_RTC); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, +			     struct clock_event_device *ced) +{ +	int ced_cpu = cpumask_first(ced->cpumask); + +	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, +			       struct clock_event_device *evt) +{ +	int ced_cpu = cpumask_first(evt->cpumask); + +	switch (mode) { +	case CLOCK_EVT_MODE_PERIODIC: +	case CLOCK_EVT_MODE_ONESHOT: +	case CLOCK_EVT_MODE_RESUME: +		/* Nothing to do here yet */ +		break; +	case CLOCK_EVT_MODE_UNUSED: +	case CLOCK_EVT_MODE_SHUTDOWN: +		uv_rtc_unset_timer(ced_cpu); +		break; +	} +} + +static void uv_rtc_interrupt(void) +{ +	struct clock_event_device *ced = &__get_cpu_var(cpu_ced); +	int cpu = smp_processor_id(); + +	if (!ced || !ced->event_handler) +		return; + +	if (uv_rtc_unset_timer(cpu) != 1) +		return; + +	ced->event_handler(ced); +} + +static int __init uv_enable_rtc(char *str) +{ +	uv_rtc_enable = 1; + +	return 1; +} +__setup("uvrtc", uv_enable_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ +	struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + +	*ced = clock_event_device_uv; +	ced->cpumask = cpumask_of(smp_processor_id()); +	clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ +	int rc; + +	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) +		return -ENODEV; + +	generic_interrupt_extension = uv_rtc_interrupt; + +	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, +				clocksource_uv.shift); + +	rc = clocksource_register(&clocksource_uv); +	if (rc) { +		generic_interrupt_extension = NULL; +		return rc; +	} + +	/* Setup and register clockevents */ +	rc = uv_rtc_allocate_timers(); +	if (rc) { +		clocksource_unregister(&clocksource_uv); +		generic_interrupt_extension = NULL; +		return rc; +	} + +	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, +				NSEC_PER_SEC, clock_event_device_uv.shift); + +	clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / +						sn_rtc_cycles_per_second; + +	clock_event_device_uv.max_delta_ns = clocksource_uv.mask * +				(NSEC_PER_SEC / sn_rtc_cycles_per_second); + +	rc = schedule_on_each_cpu(uv_rtc_register_clockevents); +	if (rc) { +		clocksource_unregister(&clocksource_uv); +		generic_interrupt_extension = NULL; +		uv_rtc_deallocate_timers(); +	} + +	return rc; +} +arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index d801d06af06..191a876e9e8 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -24,18 +24,14 @@  #include <asm/visws/cobalt.h>  #include <asm/visws/piix4.h> -#include <asm/arch_hooks.h>  #include <asm/io_apic.h>  #include <asm/fixmap.h>  #include <asm/reboot.h>  #include <asm/setup.h> +#include <asm/apic.h>  #include <asm/e820.h>  #include <asm/io.h> -#include <mach_ipi.h> - -#include "mach_apic.h" -  #include <linux/kernel_stat.h>  #include <asm/i8259.h> @@ -49,8 +45,6 @@  extern int no_broadcast; -#include <asm/apic.h> -  char visws_board_type	= -1;  char visws_board_rev	= -1; @@ -200,7 +194,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)  		return;  	} -	apic_cpus = apicid_to_cpu_present(m->apicid); +	apic_cpus = apic->apicid_to_cpu_present(m->apicid);  	
physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);  	/*  	 * Validate version diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 4eeb5cf9720..d7ac84e7fc1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)  	ret = KVM86->regs32;  	ret->fs = current->thread.saved_fs; -	loadsegment(gs, current->thread.saved_gs); +	set_user_gs(ret, current->thread.saved_gs);  	return ret;  } @@ -197,9 +197,9 @@ out:  static int do_vm86_irq_handling(int subfunction, int irqnumber);  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -asmlinkage int sys_vm86old(struct pt_regs regs) +int sys_vm86old(struct pt_regs *regs)  { -	struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; +	struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;  	struct kernel_vm86_struct info; /* declare this _on top_,  					 * this avoids wasting of stack space.  					 * This remains on the stack until we @@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)  	if (tmp)  		goto out;  	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); -	info.regs32 = ®s; +	info.regs32 = regs;  	tsk->thread.vm86_info = v86;  	do_sys_vm86(&info, tsk);  	ret = 0;	/* we never return here */ @@ -227,7 +227,7 @@ out:  } -asmlinkage int sys_vm86(struct pt_regs regs) +int sys_vm86(struct pt_regs *regs)  {  	struct kernel_vm86_struct info; /* declare this _on top_,  					 * this avoids wasting of stack space. @@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)  	struct vm86plus_struct __user *v86;  	tsk = current; -	switch (regs.bx) { +	switch (regs->bx) {  	case VM86_REQUEST_IRQ:  	case VM86_FREE_IRQ:  	case VM86_GET_IRQ_BITS:  	case VM86_GET_AND_RESET_IRQ: -		ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); +		ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);  		goto out;  	case VM86_PLUS_INSTALL_CHECK:  		/* @@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)  	ret = -EPERM;  	if (tsk->thread.saved_sp0)  		goto out; -	v86 = (struct vm86plus_struct __user *)regs.cx; +	v86 = (struct vm86plus_struct __user *)regs->cx;  	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,  				       offsetof(struct kernel_vm86_struct, regs32) -  				       sizeof(info.regs));  	ret = -EFAULT;  	if (tmp)  		goto out; -	info.regs32 = ®s; +	info.regs32 = regs;  	info.vm86plus.is_vm86pus = 1;  	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;  	do_sys_vm86(&info, tsk); @@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk  	info->regs32->ax = 0;  	tsk->thread.saved_sp0 = tsk->thread.sp0;  	tsk->thread.saved_fs = info->regs32->fs; -	savesegment(gs, tsk->thread.saved_gs); +	tsk->thread.saved_gs = get_user_gs(info->regs32);  	tss = &per_cpu(init_tss, get_cpu());  	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index bef58b4982d..2cc4a90e2cb 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -680,10 +680,11 @@ static inline int __init activate_vmi(void)  	para_fill(pv_mmu_ops.write_cr2, SetCR2);  	para_fill(pv_mmu_ops.write_cr3, SetCR3);  	para_fill(pv_cpu_ops.write_cr4, SetCR4); -	para_fill(pv_irq_ops.save_fl, GetInterruptMask); -	para_fill(pv_irq_ops.restore_fl, SetInterruptMask); -	para_fill(pv_irq_ops.irq_disable, DisableInterrupts); -	
para_fill(pv_irq_ops.irq_enable, EnableInterrupts); + +	para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); +	para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); +	para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); +	para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);  	para_fill(pv_cpu_ops.wbinvd, WBINVD);  	para_fill(pv_cpu_ops.read_tsc, RDTSC); @@ -797,8 +798,8 @@ static inline int __init activate_vmi(void)  #endif  #ifdef CONFIG_X86_LOCAL_APIC -       para_fill(apic_ops->read, APICRead); -       para_fill(apic_ops->write, APICWrite); +       para_fill(apic->read, APICRead); +       para_fill(apic->write, APICWrite);  #endif  	/* diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index e5b088fffa4..33a788d5879 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -28,7 +28,6 @@  #include <asm/vmi.h>  #include <asm/vmi_time.h> -#include <asm/arch_hooks.h>  #include <asm/apicdef.h>  #include <asm/apic.h>  #include <asm/timer.h> @@ -256,7 +255,7 @@ void __devinit vmi_time_bsp_init(void)  	 */  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);  	local_irq_disable(); -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP  	/*  	 * XXX handle_percpu_irq only defined for SMP; we need to switch over  	 * to using it, since this is a local interrupt, which each CPU must @@ -288,8 +287,7 @@ static struct clocksource clocksource_vmi;  static cycle_t read_real_cycles(void)  {  	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); -	return ret >= clocksource_vmi.cycle_last ? -		ret : clocksource_vmi.cycle_last; +	return max(ret, clocksource_vmi.cycle_last);  }  static struct clocksource clocksource_vmi = { diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 82c67559dde..0d860963f26 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -12,7 +12,7 @@  #include <asm-generic/vmlinux.lds.h>  #include <asm/thread_info.h> -#include <asm/page.h> +#include <asm/page_types.h>  #include <asm/cache.h>  #include <asm/boot.h> @@ -178,14 +178,7 @@ SECTIONS  	__initramfs_end = .;    }  #endif -  . = ALIGN(PAGE_SIZE); -  .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { -	__per_cpu_start = .; -	*(.data.percpu.page_aligned) -	*(.data.percpu) -	*(.data.percpu.shared_aligned) -	__per_cpu_end = .; -  } +  PERCPU(PAGE_SIZE)    . = ALIGN(PAGE_SIZE);    /* freed after init ends here */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 1a614c0e6be..fbfced6f680 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -5,7 +5,8 @@  #define LOAD_OFFSET __START_KERNEL_map  #include <asm-generic/vmlinux.lds.h> -#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/page_types.h>  #undef i386	/* in case the preprocessor is a 32bit one */ @@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")  OUTPUT_ARCH(i386:x86-64)  ENTRY(phys_startup_64)  jiffies_64 = jiffies; -_proxy_pda = 1;  PHDRS {  	text PT_LOAD FLAGS(5);	/* R_E */  	data PT_LOAD FLAGS(7);	/* RWE */  	user PT_LOAD FLAGS(7);	/* RWE */  	data.init PT_LOAD FLAGS(7);	/* RWE */ +#ifdef CONFIG_SMP +	percpu PT_LOAD FLAGS(7);	/* RWE */ +#endif +	data.init2 PT_LOAD FLAGS(7);	/* RWE */  	note PT_NOTE FLAGS(0);	/* ___ */  }  SECTIONS @@ -208,14 +212,28 @@ SECTIONS    __initramfs_end = .;  #endif +#ifdef CONFIG_SMP +  /* +   * percpu offsets are zero-based on SMP.  
PERCPU_VADDR() changes the +   * output PHDR, so the next output section - __data_nosave - should +   * start another section data.init2.  Also, pda should be at the head of +   * percpu area.  Preallocate it and define the percpu offset symbol +   * so that it can be accessed as a percpu variable. +   */ +  . = ALIGN(PAGE_SIZE); +  PERCPU_VADDR(0, :percpu) +#else    PERCPU(PAGE_SIZE) +#endif    . = ALIGN(PAGE_SIZE);    __init_end = .;    . = ALIGN(PAGE_SIZE);    __nosave_begin = .; -  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } +  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { +      *(.data.nosave) +  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */    . = ALIGN(PAGE_SIZE);    __nosave_end = .; @@ -239,8 +257,21 @@ SECTIONS    DWARF_DEBUG  } + /* +  * Per-cpu symbols which need to be offset from __per_cpu_load +  * for the boot processor. +  */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); +  /*   * Build-time check on the image size:   */  ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),  	"kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), +        "irq_stack_union is not at start of per-cpu area"); +#endif diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a688f3bfaec..74de562812c 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -22,7 +22,7 @@  #include <asm/paravirt.h>  #include <asm/setup.h> -#if defined CONFIG_PCI && defined CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT  /*   * Interrupt control on vSMPowered systems:   * ~AC is a shadow of IF.  If IF is 'on' AC should be 'off' @@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)  		flags &= ~X86_EFLAGS_IF;  	return flags;  } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);  static void vsmp_restore_fl(unsigned long flags)  { @@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)  		flags |= X86_EFLAGS_AC;  	native_restore_fl(flags);  } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);  static void vsmp_irq_disable(void)  { @@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)  	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);  } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);  static void vsmp_irq_enable(void)  { @@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)  	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));  } +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);  static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,  				  unsigned long addr, unsigned len) @@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)  	       cap, ctl);  	if (cap & ctl & (1 << 4)) {  		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */ -		pv_irq_ops.irq_disable = vsmp_irq_disable; -		pv_irq_ops.irq_enable  = vsmp_irq_enable; -		pv_irq_ops.save_fl  = vsmp_save_fl; -		pv_irq_ops.restore_fl  = vsmp_restore_fl; +		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); +		pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable); +		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl); +		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);  		pv_init_ops.patch = vsmp_patch;  		ctl &= ~(1 << 4); @@ -110,7 +114,6 @@ static void __init set_vsmp_pv_ops(void)  }  #endif -#ifdef CONFIG_PCI  static int is_vsmp = -1;  static void __init detect_vsmp_box(void) @@ -135,15 +138,6 @@ int is_vsmp_box(void)  		return 0;  	}  } -#else -static void __init 
detect_vsmp_box(void) -{ -} -int is_vsmp_box(void) -{ -	return 0; -} -#endif  void __init vsmp_init(void)  { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 695e426aa35..3909e3ba5ce 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);  EXPORT_SYMBOL(empty_zero_page);  EXPORT_SYMBOL(init_level4_pgt);  EXPORT_SYMBOL(load_gs_index); - -EXPORT_SYMBOL(_proxy_pda);
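
The reworked uv_flush_tlb_others() earlier in this diff (together with the removal of the old tlb_64.c flush code) changes the UV shootdown interface: instead of returning 1/0 and editing the caller's cpumask_t in place, it now returns either NULL (everything was flushed via the BAU) or a pointer to a per-cpu mask of the cpus that still need an IPI. The consumer of that return value lives outside arch/x86/kernel, so the following is only a sketch of the expected calling pattern; flush_tlb_others_ipi() is assumed here as the ordinary IPI-based fallback and is not defined in this diff.

#include <linux/smp.h>
#include <linux/mm_types.h>
#include <asm/uv/uv.h>

static void example_flush_tlb_others(const struct cpumask *cpumask,
				     struct mm_struct *mm, unsigned long va)
{
	if (is_uv_system()) {
		unsigned int cpu = get_cpu();	/* keep preemption off */

		/*
		 * NULL: the BAU reached every remote cpu.  Non-NULL: a
		 * per-cpu mask of cpus still to be flushed, valid until
		 * preemption is re-enabled.
		 */
		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
		if (cpumask)
			flush_tlb_others_ipi(cpumask, mm, va);	/* assumed helper */
		put_cpu();
		return;
	}
	flush_tlb_others_ipi(cpumask, mm, va);
}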
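
The new uv_time.c registers the SGI RTC twice: as a clocksource (cycle counts read back and converted to nanoseconds) and as a per-cpu oneshot clockevent (a nanosecond delta converted to a comparator value). That is why uv_rtc_setup_clock() computes two different mult values: clocksource_hz2mult() for the read direction and div_sc() for the programming direction. The helpers below are an illustrative sketch of that fixed-point arithmetic, not part of the patch; hz stands for sn_rtc_cycles_per_second.

#include <linux/clocksource.h>
#include <linux/clockchips.h>

/* clocksource direction: RTC cycles -> nanoseconds */
static inline u64 example_cycles_to_ns(cycle_t cycles, u32 mult, u32 shift)
{
	/* mult = clocksource_hz2mult(hz, shift), roughly (NSEC_PER_SEC << shift) / hz */
	return ((u64)cycles * mult) >> shift;
}

/* clockevent direction: nanoseconds -> RTC cycles */
static inline u64 example_ns_to_cycles(u64 ns, u32 mult, u32 shift)
{
	/* mult = div_sc(hz, NSEC_PER_SEC, shift), roughly (hz << shift) / NSEC_PER_SEC */
	return (ns * mult) >> shift;
}

The min_delta_ns and max_delta_ns assignments above follow the same arithmetic: one RTC cycle (NSEC_PER_SEC / hz) is the smallest programmable delta, and the comparator mask times the cycle period bounds the largest.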
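
The vsmp_64.c hunks (and the matching vmi_32.c change) convert pv_irq_ops from bare function pointers to struct paravirt_callee_save entries. The pattern is: keep the C implementation unchanged, emit a register-preserving assembly wrapper for it with PV_CALLEE_SAVE_REGS_THUNK(), and install the op through PV_CALLEE_SAVE() so callers can use the lighter callee-save calling convention. Below is a minimal sketch with a made-up op, example_save_fl, which simply forwards to the native flags read.

#include <asm/paravirt.h>
#include <asm/irqflags.h>

/* ordinary C implementation, unchanged by the conversion */
static unsigned long example_save_fl(void)
{
	return native_save_fl();
}
/* emit the register-preserving wrapper required by the callee-save convention */
PV_CALLEE_SAVE_REGS_THUNK(example_save_fl);

static void __init example_install_irq_ops(void)
{
	/* store a struct paravirt_callee_save, not a plain function pointer */
	pv_irq_ops.save_fl = PV_CALLEE_SAVE(example_save_fl);
}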