Diffstat (limited to 'arch/x86/kernel'): 77 files changed, 1537 insertions, 2668 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 832cb838cb4..d8e5d0cdd67 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o		:= n  obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o  obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y			+= setup.o i8259.o irqinit.o +obj-y			+= time.o ioport.o ldt.o dumpstack.o +obj-y			+= setup.o x86_init.o i8259.o irqinit.o  obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o  obj-$(CONFIG_X86_32)	+= probe_roms_32.o  obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o @@ -56,6 +56,7 @@ obj-$(CONFIG_INTEL_TXT)		+= tboot.o  obj-$(CONFIG_STACKTRACE)	+= stacktrace.o  obj-y				+= cpu/  obj-y				+= acpi/ +obj-$(CONFIG_SFI)		+= sfi.o  obj-y				+= reboot.o  obj-$(CONFIG_MCA)		+= mca_32.o  obj-$(CONFIG_X86_MSR)		+= msr.o @@ -105,6 +106,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o  scx200-y			+= scx200_32.o  obj-$(CONFIG_OLPC)		+= olpc.o +obj-$(CONFIG_X86_MRST)		+= mrst.o  microcode-y				:= microcode_core.o  microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 159740decc4..894aa97f071 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@   *	Mikael Pettersson	:	PM converted to driver model.   */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h>  #include <linux/kernel_stat.h>  #include <linux/mc146818rtc.h>  #include <linux/acpi_pmtmr.h> @@ -35,7 +35,8 @@  #include <linux/smp.h>  #include <linux/mm.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> +#include <asm/x86_init.h>  #include <asm/pgalloc.h>  #include <asm/atomic.h>  #include <asm/mpspec.h> @@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;  /*   * The highest APIC ID seen during enumeration.   * - * This determines the messaging protocol we can use: if all APIC IDs + * On AMD, this determines the messaging protocol we can use: if all APIC IDs   * are in the 0 ... 7 range, then we can use logical addressing which   * has some performance advantages (better broadcasting).   * @@ -978,7 +979,7 @@ void lapic_shutdown(void)  {  	unsigned long flags; -	if (!cpu_has_apic) +	if (!cpu_has_apic && !apic_from_smp_config())  		return;  	local_irq_save(flags); @@ -1188,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)  		apic_write(APIC_ESR, 0);  	}  #endif -	perf_counters_lapic_init(); +	perf_events_lapic_init();  	preempt_disable(); @@ -1196,8 +1197,7 @@ void __cpuinit setup_local_APIC(void)  	 * Double-check whether this APIC is really registered.  	 * This is meaningless in clustered apic mode, so we skip it.  	 */ -	if (!apic->apic_id_registered()) -		BUG(); +	BUG_ON(!apic->apic_id_registered());  	/*  	 * Intel recommends to set DFR, LDR and TPR before enabling @@ -1709,7 +1709,7 @@ int __init APIC_init_uniprocessor(void)  	localise_nmi_watchdog();  #endif -	setup_boot_clock(); +	x86_init.timers.setup_percpu_clockev();  #ifdef CONFIG_X86_64  	check_nmi_watchdog();  #endif @@ -1916,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version)  		max_physical_apicid = apicid;  #ifdef CONFIG_X86_32 -	/* -	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y -	 * but we need to work other dependencies like SMP_SUSPEND etc -	 * before this can be done without some confusion. 
-	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) -	 *       - Ashok Raj <ashok.raj@intel.com> -	 */ -	if (max_physical_apicid >= 8) { -		switch (boot_cpu_data.x86_vendor) { -		case X86_VENDOR_INTEL: -			if (!APIC_XAPIC(version)) { -				def_to_bigsmp = 0; -				break; -			} -			/* If P4 and above fall through */ -		case X86_VENDOR_AMD: +	switch (boot_cpu_data.x86_vendor) { +	case X86_VENDOR_INTEL: +		if (num_processors > 8) +			def_to_bigsmp = 1; +		break; +	case X86_VENDOR_AMD: +		if (max_physical_apicid >= 8)  			def_to_bigsmp = 1; -		}  	}  #endif diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)  	return physids_promote(0xFFL);  } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid)  {  	return 1;  } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c8f9e75d03..64970b9885f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -96,6 +96,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];  /* # of MP IRQ source entries */  int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; +  #if defined (CONFIG_MCA) || defined (CONFIG_EISA)  int mp_bus_id_to_type[MAX_MP_BUSSES];  #endif @@ -173,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {  	[15] = { .vector = IRQ15_VECTOR, },  }; +void __init io_apic_disable_legacy(void) +{ +	nr_legacy_irqs = 0; +	nr_irqs_gsi = 0; +} +  int __init arch_early_irq_init(void)  {  	struct irq_cfg *cfg; @@ -190,7 +201,7 @@ int __init arch_early_irq_init(void)  		desc->chip_data = &cfg[i];  		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);  		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); -		if (i < NR_IRQS_LEGACY) +		if (i < nr_legacy_irqs)  			cpumask_setall(cfg[i].domain);  	} @@ -867,7 +878,7 @@ static int __init find_isa_irq_apic(int irq, int type)   */  static int EISA_ELCR(unsigned int irq)  { -	if (irq < NR_IRQS_LEGACY) { +	if (irq < nr_legacy_irqs) {  		unsigned int port = 0x4d0 + (irq >> 3);  		return (inb(port) >> (irq & 7)) & 1;  	} @@ -1464,7 +1475,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq  	}  	ioapic_register_intr(irq, desc, trigger); -	if (irq < NR_IRQS_LEGACY) +	if (irq < nr_legacy_irqs)  		disable_8259A_irq(irq);  	ioapic_write_entry(apic_id, pin, entry); @@ -1831,7 +1842,7 @@ __apicdebuginit(void) print_PIC(void)  	unsigned int v;  	unsigned long flags; -	if (apic_verbosity == APIC_QUIET) +	if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)  		return;  	printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1863,7 +1874,7 @@ __apicdebuginit(int) print_all_ICs(void)  	print_PIC();  	/* don't print out if apic is not there */ -	if (!cpu_has_apic || disable_apic) +	if (!cpu_has_apic && !apic_from_smp_config())  		return 0;  	print_all_local_APICs(); @@ -1894,6 +1905,10 @@ void __init enable_IO_APIC(void)  		spin_unlock_irqrestore(&ioapic_lock, flags);  		nr_ioapic_registers[apic] = reg_01.bits.entries+1;  	} + +	if (!nr_legacy_irqs) +		return; +  	for(apic = 0; apic < nr_ioapics; apic++) {  		int pin;  		/* See if any of the pins is in ExtINT mode */ @@ -1948,6 +1963,9 @@ void 
disable_IO_APIC(void)  	 */  	clear_IO_APIC(); +	if (!nr_legacy_irqs) +		return; +  	/*  	 * If the i8259 is routed through an IOAPIC  	 * Put that IOAPIC in virtual wire mode @@ -1981,7 +1999,7 @@ void disable_IO_APIC(void)  	/*  	 * Use virtual wire A mode when interrupt remapping is enabled.  	 */ -	if (cpu_has_apic) +	if (cpu_has_apic || apic_from_smp_config())  		disconnect_bsp_APIC(!intr_remapping_enabled &&  				ioapic_i8259.pin != -1);  } @@ -1994,7 +2012,7 @@ void disable_IO_APIC(void)   * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999   */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void)  {  	union IO_APIC_reg_00 reg_00;  	physid_mask_t phys_id_present_map; @@ -2003,9 +2021,8 @@ static void __init setup_ioapic_ids_from_mpc(void)  	unsigned char old_id;  	unsigned long flags; -	if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) +	if (acpi_ioapic)  		return; -  	/*  	 * Don't check I/O APIC IDs for xAPIC systems.  They have  	 * no meaning without the serial APIC bus. @@ -2179,7 +2196,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)  	struct irq_cfg *cfg;  	spin_lock_irqsave(&ioapic_lock, flags); -	if (irq < NR_IRQS_LEGACY) { +	if (irq < nr_legacy_irqs) {  		disable_8259A_irq(irq);  		if (i8259A_irq_pending(irq))  			was_pending = 1; @@ -2657,7 +2674,7 @@ static inline void init_IO_APIC_traps(void)  			 * so default to an old-fashioned 8259  			 * interrupt if we can..  			 */ -			if (irq < NR_IRQS_LEGACY) +			if (irq < nr_legacy_irqs)  				make_8259A_irq(irq);  			else  				/* Strange. Oh, well.. */ @@ -2993,7 +3010,7 @@ out:   * the I/O APIC in all cases now.  No actual device should request   * it anyway.  --macro   */ -#define PIC_IRQS	(1 << PIC_CASCADE_IR) +#define PIC_IRQS	(1UL << PIC_CASCADE_IR)  void __init setup_IO_APIC(void)  { @@ -3001,21 +3018,19 @@ void __init setup_IO_APIC(void)  	/*  	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP  	 */ - -	io_apic_irqs = ~PIC_IRQS; +	io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;  	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");  	/*           * Set up IO-APIC IRQ routing.           
*/ -#ifdef CONFIG_X86_32 -	if (!acpi_ioapic) -		setup_ioapic_ids_from_mpc(); -#endif +	x86_init.mpparse.setup_ioapic_ids(); +  	sync_Arb_IDs();  	setup_IO_APIC_irqs();  	init_IO_APIC_traps(); -	check_timer(); +	if (nr_legacy_irqs) +		check_timer();  }  /* @@ -3116,7 +3131,6 @@ static int __init ioapic_init_sysfs(void)  device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY;  /*   * Dynamic irq allocate and deallocation   */ @@ -3856,7 +3870,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,  	/*  	 * IRQs < 16 are already in the irq_2_pin[] map  	 */ -	if (irq >= NR_IRQS_LEGACY) { +	if (irq >= nr_legacy_irqs) {  		cfg = desc->chip_data;  		if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {  			printk(KERN_INFO "can not add pin %d for irq %d\n", diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index db7220220d0..cb66a22d98a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)  static inline int mce_in_progress(void)  { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE)  	return atomic_read(&mce_entry) > 0;  #endif  	return 0; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans {  	unsigned short			trans_reserved;  }; -/* x86_quirks member */  static int				mpc_record;  static struct mpc_trans			*translation_table[MAX_MPC_ENTRY]; @@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)  	}  } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void)  {  	numaq_tsc_disable(); -	return 0;  }  static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)  	quad_local_to_mp_bus_id[quad][local] = m->busid;  } +/* + * Called from mpparse code. 
+ * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ +	if (!mode) +		mpc_record = 0; +	else +		mpc_record++; +} +  static void __init MP_translation_info(struct mpc_trans *m)  {  	printk(KERN_INFO @@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)  /*   * Read/parse the MPC oem tables   */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc)  { +	struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;  	int count = sizeof(*oemtable);	/* the header size */  	unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -250,29 +261,6 @@ static void __init  	}  } -static int __init numaq_setup_ioapic_ids(void) -{ -	/* so can skip it */ -	return 1; -} - -static struct x86_quirks numaq_x86_quirks __initdata = { -	.arch_pre_time_init		= numaq_pre_time_init, -	.arch_time_init			= NULL, -	.arch_pre_intr_init		= NULL, -	.arch_memory_setup		= NULL, -	.arch_intr_init			= NULL, -	.arch_trap_init			= NULL, -	.mach_get_smp_config		= NULL, -	.mach_find_smp_config		= NULL, -	.mpc_record			= &mpc_record, -	.mpc_apic_id			= mpc_apic_id, -	.mpc_oem_bus_info		= mpc_oem_bus_info, -	.mpc_oem_pci_bus		= mpc_oem_pci_bus, -	.smp_read_mpc_oem		= smp_read_mpc_oem, -	.setup_ioapic_ids		= numaq_setup_ioapic_ids, -}; -  static __init void early_check_numaq(void)  {  	/* @@ -286,8 +274,15 @@ static __init void early_check_numaq(void)  	if (smp_found_config)  		early_get_smp_config(); -	if (found_numaq) -		x86_quirks = &numaq_x86_quirks; +	if (found_numaq) { +		x86_init.mpparse.mpc_record = numaq_mpc_record; +		x86_init.mpparse.setup_ioapic_ids = x86_init_noop; +		x86_init.mpparse.mpc_apic_id = mpc_apic_id; +		x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; +		x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; +		x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; +		x86_init.timers.tsc_pre_init = numaq_tsc_init; +	}  }  int __init get_memcfg_numaq(void) @@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)  /* Where the IO area was mapped on multiquad, always 0 otherwise */  void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid)  {  	return 1;  } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc8..c4cbd3080c1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void)  			apic = &apic_x2apic_phys;  		else  			apic = &apic_x2apic_cluster; -		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);  	}  #endif  	if (apic == &apic_flat) { -		if (max_physical_apicid >= 8) -			apic = &apic_physflat; -		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); +		switch (boot_cpu_data.x86_vendor) { +		case X86_VENDOR_INTEL: +			if (num_processors > 8) +				apic = &apic_physflat; +			break; +		case X86_VENDOR_AMD: +			if (max_physical_apicid >= 8) +				apic = &apic_physflat; +		}  	} +	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); +  	if (is_vsmp_box()) {  		/* need to update phys_pkg_id */  		apic->phys_pkg_id = apicid_phys_pkg_id; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ 
b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid)  	return physid_mask_of_physid(0);  } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid)  {  	return 1;  } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 601159374e8..f5f5886a6b5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)  		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);  } +static __init void map_mmr_high(int max_pnode) +{ +	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; +	int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + +	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); +	if (mmr.s.enable) +		map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); +} +  static __init void map_mmioh_high(int max_pnode)  {  	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -643,6 +653,7 @@ void __init uv_system_init(void)  	}  	map_gru_high(max_pnode); +	map_mmr_high(max_pnode);  	map_mmioh_high(max_pnode);  	uv_cpu_init(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac15..68537e957a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,7 +13,7 @@ CFLAGS_common.o		:= $(nostackp)  obj-y			:= intel_cacheinfo.o addon_cpuid_features.o  obj-y			+= proc.o capflags.o powerflags.o common.o -obj-y			+= vmware.o hypervisor.o +obj-y			+= vmware.o hypervisor.o sched.o  obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o  obj-$(CONFIG_X86_64)	+= bugs_64.o @@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o  obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o -obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o +obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o  obj-$(CONFIG_X86_MCE)			+= mcheck/  obj-$(CONFIG_MTRR)			+= mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22a47c82f3c..c910a716a71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)  	 * approved Athlon  	 */  	WARN_ONCE(1, "WARNING: This combination of AMD" -		"processors is not suitable for SMP.\n"); +		" processors is not suitable for SMP.\n");  	if (!test_taint(TAINT_UNSAFE_SMP))  		add_taint(TAINT_UNSAFE_SMP); @@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)  #endif  } +int amd_get_nb_id(int cpu) +{ +	int id = 0; +#ifdef CONFIG_SMP +	id = per_cpu(cpu_llc_id, cpu); +#endif +	return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); +  static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  {  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2055fc2b2e6..cc25c2b4a56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@  #include <linux/io.h>  #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h>  #include <asm/mmu_context.h>  #include <asm/hypervisor.h>  #include <asm/processor.h> @@ -34,7 +34,6 @@  #include <asm/mce.h>  #include <asm/msr.h>  #include <asm/pat.h> -#include <linux/smp.h>  #ifdef CONFIG_X86_LOCAL_APIC  #include <asm/uv/uv.h> @@ -870,7 +869,7 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif -	
init_hw_perf_counters(); +	init_hw_perf_events();  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..7d5c3b0ea8d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@  #include <linux/cpufreq.h>  #include <linux/compiler.h>  #include <linux/dmi.h> -#include <trace/power.h> +#include <trace/events/power.h>  #include <linux/acpi.h>  #include <linux/io.h> @@ -60,7 +60,6 @@ enum {  };  #define INTEL_MSR_RANGE		(0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY	(0x1)  struct acpi_cpufreq_data {  	struct acpi_processor_performance *acpi_data; @@ -71,13 +70,7 @@ struct acpi_cpufreq_data {  static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { -	u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); - -DEFINE_TRACE(power_mark); +static DEFINE_PER_CPU(struct aperfmperf, old_perf);  /* acpi_perf_data is a pointer to percpu data. */  static struct acpi_processor_performance *acpi_perf_data; @@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)  	return cmd.val;  } -struct perf_pair { -	union { -		struct { -			u32 lo; -			u32 hi; -		} split; -		u64 whole; -	} aperf, mperf; -}; -  /* Called via smp_call_function_single(), on the target CPU */  static void read_measured_perf_ctrs(void *_cur)  { -	struct perf_pair *cur = _cur; +	struct aperfmperf *am = _cur; -	rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); -	rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); +	get_aperfmperf(am);  }  /* @@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)  static unsigned int get_measured_perf(struct cpufreq_policy *policy,  				      unsigned int cpu)  { -	struct perf_pair readin, cur; -	unsigned int perf_percent; +	struct aperfmperf perf; +	unsigned long ratio;  	unsigned int retval; -	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) +	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))  		return 0; -	cur.aperf.whole = readin.aperf.whole - -				per_cpu(msr_data, cpu).saved_aperf; -	cur.mperf.whole = readin.mperf.whole - -				per_cpu(msr_data, cpu).saved_mperf; -	per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; -	per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ -	/* -	 * We dont want to do 64 bit divide with 32 bit kernel -	 * Get an approximate value. Return failure in case we cannot get -	 * an approximate value. 
-	 */ -	if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { -		int shift_count; -		u32 h; - -		h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); -		shift_count = fls(h); - -		cur.aperf.whole >>= shift_count; -		cur.mperf.whole >>= shift_count; -	} - -	if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { -		int shift_count = 7; -		cur.aperf.split.lo >>= shift_count; -		cur.mperf.split.lo >>= shift_count; -	} - -	if (cur.aperf.split.lo && cur.mperf.split.lo) -		perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; -	else -		perf_percent = 0; - -#else -	if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { -		int shift_count = 7; -		cur.aperf.whole >>= shift_count; -		cur.mperf.whole >>= shift_count; -	} - -	if (cur.aperf.whole && cur.mperf.whole) -		perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; -	else -		perf_percent = 0; - -#endif +	ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); +	per_cpu(old_perf, cpu) = perf; -	retval = (policy->cpuinfo.max_freq * perf_percent) / 100; +	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;  	return retval;  } @@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,  	unsigned int next_perf_state = 0; /* Index into perf table */  	unsigned int i;  	int result = 0; -	struct power_trace it;  	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,  		}  	} -	trace_power_mark(&it, POWER_PSTATE, next_perf_state); +	trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);  	switch (data->cpu_feature) {  	case SYSTEM_INTEL_MSR_CAPABLE: @@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {  	},  	{ }  }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ +	/* http://www.intel.com/Assets/PDF/specupdate/314554.pdf +	 * AL30: A Machine Check Exception (MCE) Occurring during an +	 * Enhanced Intel SpeedStep Technology Ratio Change May Cause +	 * Both Processor Cores to Lock Up when HT is enabled*/ +	if (c->x86_vendor == X86_VENDOR_INTEL) { +		if ((c->x86 == 15) && +		    (c->x86_model == 6) && +		    (c->x86_mask == 8) && smt_capable()) +			return -ENODEV; +		} +	return 0; +}  #endif  static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)  	dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP +	result = acpi_cpufreq_blacklist(c); +	if (result) +		return result; +#endif +  	data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);  	if (!data)  		return -ENOMEM; @@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)  	acpi_processor_notify_smm(THIS_MODULE);  	/* Check for APERF/MPERF support in hardware */ -	if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { -		unsigned int ecx; -		ecx = cpuid_ecx(6); -		if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) -			acpi_cpufreq_driver.getavg = get_measured_perf; -	} +	if (cpu_has(c, X86_FEATURE_APERFMPERF)) +		acpi_cpufreq_driver.getavg = get_measured_perf;  	dprintk("CPU%u - ACPI performance management activated.\n", cpu);  	for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..6394aa5c798 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -605,9 +605,10 @@ static int 
check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,  	return 0;  } -static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) +static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, +		unsigned int entry)  { -	data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; +	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;  }  static void print_basics(struct powernow_k8_data *data) @@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  		goto err_out;  	} +	/* fill in data */ +	data->numps = data->acpi_data.state_count; +	powernow_k8_acpi_pst_values(data, 0); +  	if (cpu_family == CPU_HW_PSTATE)  		ret_val = fill_powernow_table_pstate(data, powernow_table);  	else @@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)  	powernow_table[data->acpi_data.state_count].index = 0;  	data->powernow_table = powernow_table; -	/* fill in data */ -	data->numps = data->acpi_data.state_count;  	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)  		print_basics(data); -	powernow_k8_acpi_pst_values(data, 0);  	/* notify BIOS that we exist */  	acpi_processor_notify_smm(THIS_MODULE); @@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,  					"bad value %d.\n", i, index);  			printk(KERN_ERR PFX "Please report to BIOS "  					"manufacturer\n"); -			invalidate_entry(data, i); +			invalidate_entry(powernow_table, i);  			continue;  		}  		rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);  		if (!(hi & HW_PSTATE_VALID_MASK)) {  			dprintk("invalid pstate %d, ignoring\n", index); -			invalidate_entry(data, i); +			invalidate_entry(powernow_table, i);  			continue;  		} @@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,  		struct cpufreq_frequency_table *powernow_table)  {  	int i; -	int cntlofreq = 0;  	for (i = 0; i < data->acpi_data.state_count; i++) {  		u32 fid; @@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,  		/* verify frequency is OK */  		if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {  			dprintk("invalid freq %u kHz, ignoring\n", freq); -			invalidate_entry(data, i); +			invalidate_entry(powernow_table, i);  			continue;  		} @@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,  		 * BIOSs are using "off" to indicate invalid */  		if (vid == VID_OFF) {  			dprintk("invalid vid %u, ignoring\n", vid); -			invalidate_entry(data, i); +			invalidate_entry(powernow_table, i);  			continue;  		} -		/* verify only 1 entry from the lo frequency table */ -		if (fid < HI_FID_TABLE_BOTTOM) { -			if (cntlofreq) { -				/* if both entries are the same, -				 * ignore this one ... */ -				if ((freq != powernow_table[cntlofreq].frequency) || -				    (index != powernow_table[cntlofreq].index)) { -					printk(KERN_ERR PFX -						"Too many lo freq table " -						"entries\n"); -					return 1; -				} - -				dprintk("double low frequency table entry, " -						"ignoring it.\n"); -				invalidate_entry(data, i); -				continue; -			} else -				cntlofreq = i; -		} -  		if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {  			printk(KERN_INFO PFX "invalid freq entries "  				"%u kHz vs. 
%u kHz\n", freq,  				(unsigned int)  				(data->acpi_data.states[i].core_frequency  				 * 1000)); -			invalidate_entry(data, i); +			invalidate_entry(powernow_table, i);  			continue;  		}  	} diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)  		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;  } -unsigned long get_hypervisor_tsc_freq(void) -{ -	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) -		return vmware_get_tsc_khz(); -	return 0; -} -  static inline void __cpuinit  hypervisor_set_feature_bits(struct cpuinfo_x86 *c)  { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)  	detect_hypervisor_vendor(c);  	hypervisor_set_feature_bits(c);  } + +void __init init_hypervisor_platform(void) +{ +	init_hypervisor(&boot_cpu_data); +	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) +		vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 80a722a071b..40e1835b35e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);  	} +	if (c->cpuid_level > 6) { +		unsigned ecx = cpuid_ecx(6); +		if (ecx & 0x01) +			set_cpu_cap(c, X86_FEATURE_APERFMPERF); +	} +  	if (cpu_has_xmm2)  		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);  	if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,8 @@ -obj-y				=  mce.o +obj-y				=  mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o -obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o  obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o  obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o  obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o  obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o  obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@redhat.com> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ -	u32 alow, ahigh, high, low; -	u32 mcgstl, mcgsth; -	int recover = 1; -	int i; - -	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -	if (mcgstl & (1<<0))	/* Recoverable ? 
*/ -		recover = 0; - -	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", -		smp_processor_id(), mcgsth, mcgstl); - -	for (i = 1; i < nr_mce_banks; i++) { -		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high & (1<<31)) { -			char misc[20]; -			char addr[24]; - -			misc[0] = '\0'; -			addr[0] = '\0'; - -			if (high & (1<<29)) -				recover |= 1; -			if (high & (1<<25)) -				recover |= 2; -			high &= ~(1<<31); - -			if (high & (1<<27)) { -				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); -				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); -			} -			if (high & (1<<26)) { -				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); -				snprintf(addr, 24, " at %08x%08x", ahigh, alow); -			} - -			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", -				smp_processor_id(), i, high, low, misc, addr); - -			/* Clear it: */ -			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); -			/* Serialize: */ -			wmb(); -			add_taint(TAINT_MACHINE_CHECK); -		} -	} - -	if (recover & 2) -		panic("CPU context corrupt"); -	if (recover & 1) -		panic("Unable to continue"); - -	printk(KERN_EMERG "Attempting to continue.\n"); - -	mcgstl &= ~(1<<2); -	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	int i; - -	if (!cpu_has(c, X86_FEATURE_MCE)) -		return; - -	machine_check_vector = k7_machine_check; -	/* Make sure the vector pointer is visible before we enable MCEs: */ -	wmb(); - -	printk(KERN_INFO "Intel machine check architecture supported.\n"); - -	rdmsr(MSR_IA32_MCG_CAP, l, h); -	if (l & (1<<8))	/* Control register present ? */ -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); -	nr_mce_banks = l & 0xff; - -	/* -	 * Clear status for MC index 0 separately, we don't touch CTL, -	 * as some K7 Athlons cause spurious MCEs when its enabled: -	 */ -	if (boot_cpu_data.x86 == 6) { -		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); -		i = 1; -	} else -		i = 0; - -	for (; i < nr_mce_banks; i++) { -		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); -		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); -	} - -	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", -		smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..7029f0e2aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@  #include <linux/string.h>  #include <linux/fs.h>  #include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/sched.h>  #include <asm/mce.h> +#include <asm/apic.h>  /* Update fake mce registers on current CPU. 
*/  static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m)  	i->finished = 1;  } -struct delayed_mce { -	struct timer_list timer; -	struct mce m; -}; +static void raise_poll(struct mce *m) +{ +	unsigned long flags; +	mce_banks_t b; -/* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +	memset(&b, 0xff, sizeof(mce_banks_t)); +	local_irq_save(flags); +	machine_check_poll(0, &b); +	local_irq_restore(flags); +	m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs)  { -	struct delayed_mce *dm = (struct delayed_mce *)data; -	struct mce *m = &dm->m; -	int cpu = m->extcpu; +	struct pt_regs regs; +	unsigned long flags; -	inject_mce(m); -	if (m->status & MCI_STATUS_UC) { -		struct pt_regs regs; +	if (!pregs) {  		memset(&regs, 0, sizeof(struct pt_regs));  		regs.ip = m->ip;  		regs.cs = m->cs; +		pregs = &regs; +	} +	/* in mcheck exeception handler, irq will be disabled */ +	local_irq_save(flags); +	do_machine_check(pregs, 0); +	local_irq_restore(flags); +	m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, +			    unsigned long val, void *data) +{ +	struct die_args *args = (struct die_args *)data; +	int cpu = smp_processor_id(); +	struct mce *m = &__get_cpu_var(injectm); +	if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) +		return NOTIFY_DONE; +	cpu_clear(cpu, mce_inject_cpumask); +	if (m->inject_flags & MCJ_EXCEPTION) +		raise_exception(m, args->regs); +	else if (m->status) +		raise_poll(m); +	return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { +	.notifier_call = mce_raise_notify, +	.priority = 1000, +}; + +/* Inject mce on current CPU */ +static int raise_local(struct mce *m) +{ +	int context = MCJ_CTX(m->inject_flags); +	int ret = 0; +	int cpu = m->extcpu; + +	if (m->inject_flags & MCJ_EXCEPTION) {  		printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); -		do_machine_check(&regs, 0); +		switch (context) { +		case MCJ_CTX_IRQ: +			/* +			 * Could do more to fake interrupts like +			 * calling irq_enter, but the necessary +			 * machinery isn't exported currently. 
+			 */ +			/*FALL THROUGH*/ +		case MCJ_CTX_PROCESS: +			raise_exception(m, NULL); +			break; +		default: +			printk(KERN_INFO "Invalid MCE context\n"); +			ret = -EINVAL; +		}  		printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); -	} else { -		mce_banks_t b; -		memset(&b, 0xff, sizeof(mce_banks_t)); +	} else if (m->status) {  		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); -		machine_check_poll(0, &b); +		raise_poll(m);  		mce_notify_irq(); -		printk(KERN_INFO "Finished machine check poll on CPU %d\n", -		       cpu); -	} -	kfree(dm); +		printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); +	} else +		m->finished = 0; + +	return ret; +} + +static void raise_mce(struct mce *m) +{ +	int context = MCJ_CTX(m->inject_flags); + +	inject_mce(m); + +	if (context == MCJ_CTX_RANDOM) +		return; + +#ifdef CONFIG_X86_LOCAL_APIC +	if (m->inject_flags & MCJ_NMI_BROADCAST) { +		unsigned long start; +		int cpu; +		get_online_cpus(); +		mce_inject_cpumask = cpu_online_map; +		cpu_clear(get_cpu(), mce_inject_cpumask); +		for_each_online_cpu(cpu) { +			struct mce *mcpu = &per_cpu(injectm, cpu); +			if (!mcpu->finished || +			    MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) +				cpu_clear(cpu, mce_inject_cpumask); +		} +		if (!cpus_empty(mce_inject_cpumask)) +			apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); +		start = jiffies; +		while (!cpus_empty(mce_inject_cpumask)) { +			if (!time_before(jiffies, start + 2*HZ)) { +				printk(KERN_ERR +				"Timeout waiting for mce inject NMI %lx\n", +					*cpus_addr(mce_inject_cpumask)); +				break; +			} +			cpu_relax(); +		} +		raise_local(m); +		put_cpu(); +		put_online_cpus(); +	} else +#endif +		raise_local(m);  }  /* Error injection interface */  static ssize_t mce_write(struct file *filp, const char __user *ubuf,  			 size_t usize, loff_t *off)  { -	struct delayed_mce *dm;  	struct mce m;  	if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,  	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))  		return -EINVAL; -	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); -	if (!dm) -		return -ENOMEM; -  	/*  	 * Need to give user space some time to set everything up,  	 * so do it a jiffie or two later everywhere. -	 * Should we use a hrtimer here for better synchronization?  	 */ -	memcpy(&dm->m, &m, sizeof(struct mce)); -	setup_timer(&dm->timer, raise_mce, (unsigned long)dm); -	dm->timer.expires = jiffies + 2; -	add_timer_on(&dm->timer, m.extcpu); +	schedule_timeout(2); +	raise_mce(&m);  	return usize;  } @@ -116,6 +211,7 @@ static int inject_init(void)  {  	printk(KERN_INFO "Machine check injector initialized\n");  	mce_chrdev_ops.write = mce_write; +	register_die_notifier(&mce_raise_nb);  	return 0;  } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include <linux/sysdev.h>  #include <asm/mce.h>  enum severity_level { @@ -10,6 +11,20 @@ enum severity_level {  	MCE_PANIC_SEVERITY,  }; +#define ATTR_LEN		16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { +	u64			ctl;			/* subevents to enable */ +	unsigned char init;				/* initialise bank? 
*/ +	struct sysdev_attribute attr;			/* sysdev attribute */ +	char			attrname[ATTR_LEN];	/* attribute name */ +}; +  int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void);  extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..8a85dd1b1aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)  	}  } +#ifdef CONFIG_DEBUG_FS  static void *s_start(struct seq_file *f, loff_t *pos)  {  	if (*pos >= ARRAY_SIZE(severities)) @@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)  {  	struct dentry *dmce = NULL, *fseverities_coverage = NULL; -	dmce = debugfs_create_dir("mce", NULL); +	dmce = mce_get_debugfs_dir();  	if (dmce == NULL)  		goto err_out;  	fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)  	return 0;  err_out: -	if (fseverities_coverage) -		debugfs_remove(fseverities_coverage); -	if (dmce) -		debugfs_remove(dmce);  	return -ENOMEM;  }  late_initcall(severities_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fdd51b55435..2f5aab26320 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@  #include <linux/smp.h>  #include <linux/fs.h>  #include <linux/mm.h> +#include <linux/debugfs.h>  #include <asm/processor.h>  #include <asm/hw_irq.h> @@ -45,21 +46,8 @@  #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ -	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", -	       smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = -						unexpected_machine_check; -  int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE -  #define MISC_MCELOG_MINOR	227  #define SPINUNIT 100	/* 100ns */ @@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);   */  static int			tolerant		__read_mostly = 1;  static int			banks			__read_mostly; -static u64			*bank			__read_mostly;  static int			rip_msr			__read_mostly;  static int			mce_bootlog		__read_mostly = -1;  static int			monarch_timeout		__read_mostly = -1; @@ -87,13 +74,13 @@ int				mce_cmci_disabled	__read_mostly;  int				mce_ignore_ce		__read_mostly;  int				mce_ser			__read_mostly; +struct mce_bank                *mce_banks		__read_mostly; +  /* User mode helper program triggered by machine check event */  static unsigned long		mce_need_notify;  static char			mce_helper[128];  static char			*mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long		dont_init_banks; -  static DECLARE_WAIT_QUEUE_HEAD(mce_wait);  static DEFINE_PER_CPU(struct mce, mces_seen);  static int			cpu_missing; @@ -104,11 +91,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {  	[0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL  }; -static inline int skip_bank_init(int i) -{ -	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} -  static DEFINE_PER_CPU(struct work_struct, mce_work);  /* Do initial initialization of a struct mce */ @@ -232,6 +214,9 @@ static void print_mce_tail(void)  static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; +  /* Panic in progress. Enable interrupts and wait for final IPI */  static void wait_for_panic(void)  { @@ -249,15 +234,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)  {  	int i; -	/* -	 * Make sure only one CPU runs in machine check panic -	 */ -	if (atomic_add_return(1, &mce_paniced) > 1) -		wait_for_panic(); -	barrier(); +	if (!fake_panic) { +		/* +		 * Make sure only one CPU runs in machine check panic +		 */ +		if (atomic_inc_return(&mce_paniced) > 1) +			wait_for_panic(); +		barrier(); -	bust_spinlocks(1); -	console_verbose(); +		bust_spinlocks(1); +		console_verbose(); +	} else { +		/* Don't log too much for fake panic */ +		if (atomic_inc_return(&mce_fake_paniced) > 1) +			return; +	}  	print_mce_head();  	/* First print corrected ones that are still unlogged */  	for (i = 0; i < MCE_LOG_LEN; i++) { @@ -284,9 +275,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)  	print_mce_tail();  	if (exp)  		printk(KERN_EMERG "Machine check: %s\n", exp); -	if (panic_timeout == 0) -		panic_timeout = mce_panic_timeout; -	panic(msg); +	if (!fake_panic) { +		if (panic_timeout == 0) +			panic_timeout = mce_panic_timeout; +		panic(msg); +	} else +		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);  }  /* Support code for software error injection */ @@ -296,11 +290,11 @@ static int msr_to_offset(u32 msr)  	unsigned bank = __get_cpu_var(injectm.bank);  	if (msr == rip_msr)  		return offsetof(struct mce, ip); -	if (msr == MSR_IA32_MC0_STATUS + bank*4) +	if (msr == MSR_IA32_MCx_STATUS(bank))  		return offsetof(struct mce, status); -	if (msr == MSR_IA32_MC0_ADDR + bank*4) +	if (msr == MSR_IA32_MCx_ADDR(bank))  		return offsetof(struct mce, addr); -	if (msr == MSR_IA32_MC0_MISC + bank*4) +	if (msr == MSR_IA32_MCx_MISC(bank))  		return offsetof(struct mce, misc);  	if (msr == MSR_IA32_MCG_STATUS)  		return offsetof(struct mce, mcgstatus); @@ -505,7 +499,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);  	for (i = 0; i < banks; i++) { -		if (!bank[i] || !test_bit(i, *b)) +		if (!mce_banks[i].ctl || !test_bit(i, *b))  			continue;  		m.misc = 0; @@ -514,7 +508,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		m.tsc = 0;  		barrier(); -		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));  		if (!(m.status & MCI_STATUS_VAL))  			continue; @@ -529,9 +523,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  			continue;  		if (m.status & MCI_STATUS_MISCV) -			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));  		if (m.status & MCI_STATUS_ADDRV) -			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); +			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));  		if (!(flags & MCP_TIMESTAMP))  			m.tsc = 0; @@ -547,7 +541,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		/*  		 * Clear state for this bank.  		 
*/ -		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);  	}  	/* @@ -568,7 +562,7 @@ static int mce_no_way_out(struct mce *m, char **msg)  	int i;  	for (i = 0; i < banks; i++) { -		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));  		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)  			return 1;  	} @@ -628,7 +622,7 @@ out:   * This way we prevent any potential data corruption in a unrecoverable case   * and also makes sure always all CPU's errors are examined.   * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer   * space (not detected by any CPUs) In this case some external agent wants   * us to shut down, so panic too.   * @@ -681,7 +675,7 @@ static void mce_reign(void)  	 * No machine check event found. Must be some external  	 * source or one CPU is hung. Panic.  	 */ -	if (!m && tolerant < 3) +	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)  		mce_panic("Machine check from unknown source", NULL, NULL);  	/* @@ -715,7 +709,7 @@ static int mce_start(int *no_way_out)  	 * global_nwo should be updated before mce_callin  	 */  	smp_wmb(); -	order = atomic_add_return(1, &mce_callin); +	order = atomic_inc_return(&mce_callin);  	/*  	 * Wait for everyone. @@ -852,7 +846,7 @@ static void mce_clear_state(unsigned long *toclear)  	for (i = 0; i < banks; i++) {  		if (test_bit(i, toclear)) -			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);  	}  } @@ -905,11 +899,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	mce_setup(&m);  	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); -	no_way_out = mce_no_way_out(&m, &msg); -  	final = &__get_cpu_var(mces_seen);  	*final = m; +	no_way_out = mce_no_way_out(&m, &msg); +  	barrier();  	/* @@ -926,14 +920,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	order = mce_start(&no_way_out);  	for (i = 0; i < banks; i++) {  		__clear_bit(i, toclear); -		if (!bank[i]) +		if (!mce_banks[i].ctl)  			continue;  		m.misc = 0;  		m.addr = 0;  		m.bank = i; -		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); +		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));  		if ((m.status & MCI_STATUS_VAL) == 0)  			continue; @@ -974,9 +968,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)  			kill_it = 1;  		if (m.status & MCI_STATUS_MISCV) -			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); +			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));  		if (m.status & MCI_STATUS_ADDRV) -			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); +			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));  		/*  		 * Action optional error. Queue address for later processing. @@ -1169,10 +1163,25 @@ int mce_notify_irq(void)  }  EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ +	int i; + +	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); +	if (!mce_banks) +		return -ENOMEM; +	for (i = 0; i < banks; i++) { +		struct mce_bank *b = &mce_banks[i]; +		b->ctl = -1ULL; +		b->init = 1; +	} +	return 0; +} +  /*   * Initialize Machine Checks for a CPU.   
*/ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void)  {  	unsigned b;  	u64 cap; @@ -1192,11 +1201,10 @@ static int mce_cap_init(void)  	/* Don't support asymmetric configurations today */  	WARN_ON(banks != 0 && b != banks);  	banks = b; -	if (!bank) { -		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); -		if (!bank) -			return -ENOMEM; -		memset(bank, 0xff, banks * sizeof(u64)); +	if (!mce_banks) { +		int err = mce_banks_init(); +		if (err) +			return err;  	}  	/* Use accurate RIP reporting if available. */ @@ -1228,15 +1236,16 @@ static void mce_init(void)  		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);  	for (i = 0; i < banks; i++) { -		if (skip_bank_init(i)) +		struct mce_bank *b = &mce_banks[i]; +		if (!b->init)  			continue; -		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); -		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); +		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); +		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);  	}  }  /* Add per CPU specific workarounds here */ -static int mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)  {  	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {  		pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1251,7 +1260,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)  			 * trips off incorrectly with the IOMMU & 3ware  			 * & Cerberus:  			 */ -			clear_bit(10, (unsigned long *)&bank[4]); +			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);  		}  		if (c->x86 <= 17 && mce_bootlog < 0) {  			/* @@ -1265,7 +1274,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)  		 * by default.  		 */  		 if (c->x86 == 6 && banks > 0) -			bank[0] = 0; +			mce_banks[0].ctl = 0;  	}  	if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1278,8 +1287,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)  		 * valid event later, merely don't write CTL0.  		 */ -		if (c->x86 == 6 && c->x86_model < 0x1A) -			__set_bit(0, &dont_init_banks); +		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) +			mce_banks[0].init = 0;  		/*  		 * All newer Intel systems support MCE broadcasting. Enable @@ -1348,6 +1357,17 @@ static void mce_init_timer(void)  	add_timer_on(t, smp_processor_id());  } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ +	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", +	       smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = +						unexpected_machine_check; +  /*   * Called for each booted CPU to set up machine checks.   
* Must be called with preempt off: @@ -1561,8 +1581,10 @@ static struct miscdevice mce_log_device = {   */  static int __init mcheck_enable(char *str)  { -	if (*str == 0) +	if (*str == 0) {  		enable_p5_mce(); +		return 1; +	}  	if (*str == '=')  		str++;  	if (!strcmp(str, "off")) @@ -1603,8 +1625,9 @@ static int mce_disable(void)  	int i;  	for (i = 0; i < banks; i++) { -		if (!skip_bank_init(i)) -			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +		struct mce_bank *b = &mce_banks[i]; +		if (b->init) +			wrmsrl(MSR_IA32_MCx_CTL(i), 0);  	}  	return 0;  } @@ -1679,14 +1702,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);  __cpuinitdata  void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ +	return container_of(attr, struct mce_bank, attr); +}  static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,  			 char *buf)  { -	u64 b = bank[attr - bank_attrs]; - -	return sprintf(buf, "%llx\n", b); +	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);  }  static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1697,7 +1721,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,  	if (strict_strtoull(buf, 0, &new) < 0)  		return -EINVAL; -	bank[attr - bank_attrs] = new; +	attr_to_bank(attr)->ctl = new;  	mce_restart();  	return size; @@ -1839,7 +1863,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)  	}  	for (j = 0; j < banks; j++) {  		err = sysdev_create_file(&per_cpu(mce_dev, cpu), -					&bank_attrs[j]); +					&mce_banks[j].attr);  		if (err)  			goto error2;  	} @@ -1848,10 +1872,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)  	return 0;  error2:  	while (--j >= 0) -		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); +		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);  error:  	while (--i >= 0) -		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); +		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);  	sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1869,7 +1893,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)  		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);  	for (i = 0; i < banks; i++) -		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); +		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);  	sysdev_unregister(&per_cpu(mce_dev, cpu));  	cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1886,8 +1910,9 @@ static void mce_disable_cpu(void *h)  	if (!(action & CPU_TASKS_FROZEN))  		cmci_clear();  	for (i = 0; i < banks; i++) { -		if (!skip_bank_init(i)) -			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +		struct mce_bank *b = &mce_banks[i]; +		if (b->init) +			wrmsrl(MSR_IA32_MCx_CTL(i), 0);  	}  } @@ -1902,8 +1927,9 @@ static void mce_reenable_cpu(void *h)  	if (!(action & CPU_TASKS_FROZEN))  		cmci_reenable();  	for (i = 0; i < banks; i++) { -		if (!skip_bank_init(i)) -			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +		struct mce_bank *b = &mce_banks[i]; +		if (b->init) +			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);  	}  } @@ -1951,35 +1977,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {  	.notifier_call = mce_cpu_callback,  }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void)  {  	int i; -	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, -				GFP_KERNEL); -	if (!bank_attrs) -		return -ENOMEM; -  	for (i = 0; i < banks; i++) { 
-		struct sysdev_attribute *a = &bank_attrs[i]; +		struct mce_bank *b = &mce_banks[i]; +		struct sysdev_attribute *a = &b->attr; -		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i); -		if (!a->attr.name) -			goto nomem; +		a->attr.name	= b->attrname; +		snprintf(b->attrname, ATTR_LEN, "bank%d", i);  		a->attr.mode	= 0644;  		a->show		= show_bank;  		a->store	= set_bank;  	} -	return 0; - -nomem: -	while (--i >= 0) -		kfree(bank_attrs[i].attr.name); -	kfree(bank_attrs); -	bank_attrs = NULL; - -	return -ENOMEM;  }  static __init int mce_init_device(void) @@ -1992,9 +2004,7 @@ static __init int mce_init_device(void)  	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); -	err = mce_init_banks(); -	if (err) -		return err; +	mce_init_banks();  	err = sysdev_class_register(&mce_sysclass);  	if (err) @@ -2014,57 +2024,65 @@ static __init int mce_init_device(void)  device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */ +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ +	mce_disabled = 1; +	return 1; +} +__setup("nomce", mcheck_disable); -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void)  { -	if (mce_disabled) -		return; +	static struct dentry *dmce; -	switch (c->x86_vendor) { -	case X86_VENDOR_AMD: -		amd_mcheck_init(c); -		break; +	if (!dmce) +		dmce = debugfs_create_dir("mce", NULL); -	case X86_VENDOR_INTEL: -		if (c->x86 == 5) -			intel_p5_mcheck_init(c); -		if (c->x86 == 6) -			intel_p6_mcheck_init(c); -		if (c->x86 == 15) -			intel_p4_mcheck_init(c); -		break; +	return dmce; +} -	case X86_VENDOR_CENTAUR: -		if (c->x86 == 5) -			winchip_mcheck_init(c); -		break; +static void mce_reset(void) +{ +	cpu_missing = 0; +	atomic_set(&mce_fake_paniced, 0); +	atomic_set(&mce_executing, 0); +	atomic_set(&mce_callin, 0); +	atomic_set(&global_nwo, 0); +} -	default: -		break; -	} -	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +static int fake_panic_get(void *data, u64 *val) +{ +	*val = fake_panic; +	return 0;  } -static int __init mcheck_enable(char *str) +static int fake_panic_set(void *data, u64 val)  { -	mce_p5_enabled = 1; -	return 1; +	mce_reset(); +	fake_panic = val; +	return 0;  } -__setup("mce", mcheck_enable); -#endif /* CONFIG_X86_OLD_MCE */ +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, +			fake_panic_set, "%llu\n"); -/* - * Old style boot options parsing. Only for compatibility. 
- */ -static int __init mcheck_disable(char *str) +static int __init mce_debugfs_init(void)  { -	mce_disabled = 1; -	return 1; +	struct dentry *dmce, *ffake_panic; + +	dmce = mce_get_debugfs_dir(); +	if (!dmce) +		return -ENOMEM; +	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, +					  &fake_panic_fops); +	if (!ffake_panic) +		return -ENOMEM; + +	return 0;  } -__setup("nomce", mcheck_disable); +late_initcall(mce_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 8cd5224943b..83a3d1f4efc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  	int i, err = 0;  	struct threshold_bank *b = NULL;  	char name[32]; +#ifdef CONFIG_SMP  	struct cpuinfo_x86 *c = &cpu_data(cpu); - +#endif  	sprintf(name, "threshold_bank%i", bank); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..889f665fe93 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot)  		if (test_bit(i, owned))  			continue; -		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); +		rdmsrl(MSR_IA32_MCx_CTL2(i), val);  		/* Already owned by someone else? */  		if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot)  		}  		val |= CMCI_EN | CMCI_THRESHOLD; -		wrmsrl(MSR_IA32_MC0_CTL2 + i, val); -		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); +		wrmsrl(MSR_IA32_MCx_CTL2(i), val); +		rdmsrl(MSR_IA32_MCx_CTL2(i), val);  		/* Did the enable bit stick? -- the bank supports CMCI */  		if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void)  		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))  			continue;  		/* Disable CMCI */ -		rdmsrl(MSR_IA32_MC0_CTL2 + i, val); +		rdmsrl(MSR_IA32_MCx_CTL2(i), val);  		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); -		wrmsrl(MSR_IA32_MC0_CTL2 + i, val); +		wrmsrl(MSR_IA32_MCx_CTL2(i), val);  		__clear_bit(i, __get_cpu_var(mce_banks_owned));  	}  	spin_unlock_irqrestore(&cmci_discover_lock, flags); diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. 
<davej@redhat.com> - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include <linux/interrupt.h> -#include <linux/workqueue.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -static int		firstbank; - -#define MCE_RATE	(15*HZ)	/* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ -	u32 low, high; -	int i; - -	for (i = firstbank; i < nr_mce_banks; i++) { -		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - -		if (!(high & (1<<31))) -			continue; - -		printk(KERN_INFO "MCE: The hardware reports a non fatal, " -			"correctable incident occurred on CPU %d.\n", -				smp_processor_id()); - -		printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - -		/* -		 * Scrub the error so we don't pick it up in MCE_RATE -		 * seconds time: -		 */ -		wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - -		/* Serialize: */ -		wmb(); -		add_taint(TAINT_MACHINE_CHECK); -	} -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ -	on_each_cpu(mce_checkregs, NULL, 1); -	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ -	struct cpuinfo_x86 *c = &boot_cpu_data; - -	/* Check for MCE support */ -	if (!cpu_has(c, X86_FEATURE_MCE)) -		return -ENODEV; - -	/* Check for PPro style MCA */ -	if (!cpu_has(c, X86_FEATURE_MCA)) -		return -ENODEV; - -	/* Some Athlons misbehave when we frob bank 0 */ -	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -						boot_cpu_data.x86 == 6) -		firstbank = 1; -	else -		firstbank = 0; - -	/* -	 * Check for non-fatal errors every MCE_RATE s -	 */ -	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -	printk(KERN_INFO "Machine check exception polling timer started.\n"); - -	return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { -	u32 eax; -	u32 ebx; -	u32 ecx; -	u32 edx; -	u32 esi; -	u32 edi; -	u32 ebp; -	u32 esp; -	u32 eflags; -	u32 eip; -	/* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ -	u32 h; - -	rdmsr(MSR_IA32_MCG_EAX, r->eax, h); -	rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); -	rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); -	rdmsr(MSR_IA32_MCG_EDX, r->edx, h); -	rdmsr(MSR_IA32_MCG_ESI, r->esi, h); -	rdmsr(MSR_IA32_MCG_EDI, r->edi, h); -	rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); -	rdmsr(MSR_IA32_MCG_ESP, r->esp, h); -	rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); -	rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ -	u32 alow, ahigh, high, low; -	u32 mcgstl, mcgsth; -	int recover = 1; -	int i; - -	
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -	if (mcgstl & (1<<0))	/* Recoverable ? */ -		recover = 0; - -	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", -		smp_processor_id(), mcgsth, mcgstl); - -	if (mce_num_extended_msrs > 0) { -		struct intel_mce_extended_msrs dbg; - -		intel_get_extended_msrs(&dbg); - -		printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" -			"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" -			"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", -			smp_processor_id(), dbg.eip, dbg.eflags, -			dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, -			dbg.esi, dbg.edi, dbg.ebp, dbg.esp); -	} - -	for (i = 0; i < nr_mce_banks; i++) { -		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high & (1<<31)) { -			char misc[20]; -			char addr[24]; - -			misc[0] = addr[0] = '\0'; -			if (high & (1<<29)) -				recover |= 1; -			if (high & (1<<25)) -				recover |= 2; -			high &= ~(1<<31); -			if (high & (1<<27)) { -				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); -				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); -			} -			if (high & (1<<26)) { -				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); -				snprintf(addr, 24, " at %08x%08x", ahigh, alow); -			} -			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", -				smp_processor_id(), i, high, low, misc, addr); -		} -	} - -	if (recover & 2) -		panic("CPU context corrupt"); -	if (recover & 1) -		panic("Unable to continue"); - -	printk(KERN_EMERG "Attempting to continue.\n"); - -	/* -	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not -	 * recoverable/continuable.This will allow BIOS to look at the MSRs -	 * for errors if the OS could not log the error. -	 */ -	for (i = 0; i < nr_mce_banks; i++) { -		u32 msr; -		msr = MSR_IA32_MC0_STATUS+i*4; -		rdmsr(msr, low, high); -		if (high&(1<<31)) { -			/* Clear it */ -			wrmsr(msr, 0UL, 0UL); -			/* Serialize */ -			wmb(); -			add_taint(TAINT_MACHINE_CHECK); -		} -	} -	mcgstl &= ~(1<<2); -	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	int i; - -	machine_check_vector = intel_machine_check; -	wmb(); - -	printk(KERN_INFO "Intel machine check architecture supported.\n"); -	rdmsr(MSR_IA32_MCG_CAP, l, h); -	if (l & (1<<8))	/* Control register present ? 
*/ -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); -	nr_mce_banks = l & 0xff; - -	for (i = 0; i < nr_mce_banks; i++) { -		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); -		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); -	} - -	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", -		smp_processor_id()); - -	/* Check for P4/Xeon extended MCE MSRs */ -	rdmsr(MSR_IA32_MCG_CAP, l, h); -	if (l & (1<<9))	{/* MCG_EXT_P */ -		mce_num_extended_msrs = (l >> 16) & 0xff; -		printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" -				" available\n", -			smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL -		/* Check for P4/Xeon Thermal monitor */ -		intel_init_thermal(c); -#endif -	} -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ -	u32 alow, ahigh, high, low; -	u32 mcgstl, mcgsth; -	int recover = 1; -	int i; - -	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -	if (mcgstl & (1<<0))	/* Recoverable ? */ -		recover = 0; - -	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", -		smp_processor_id(), mcgsth, mcgstl); - -	for (i = 0; i < nr_mce_banks; i++) { -		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); -		if (high & (1<<31)) { -			char misc[20]; -			char addr[24]; - -			misc[0] = '\0'; -			addr[0] = '\0'; - -			if (high & (1<<29)) -				recover |= 1; -			if (high & (1<<25)) -				recover |= 2; -			high &= ~(1<<31); - -			if (high & (1<<27)) { -				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); -				snprintf(misc, 20, "[%08x%08x]", ahigh, alow); -			} -			if (high & (1<<26)) { -				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); -				snprintf(addr, 24, " at %08x%08x", ahigh, alow); -			} - -			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", -				smp_processor_id(), i, high, low, misc, addr); -		} -	} - -	if (recover & 2) -		panic("CPU context corrupt"); -	if (recover & 1) -		panic("Unable to continue"); - -	printk(KERN_EMERG "Attempting to continue.\n"); -	/* -	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not -	 * recoverable/continuable.This will allow BIOS to look at the MSRs -	 * for errors if the OS could not log the error: -	 */ -	for (i = 0; i < nr_mce_banks; i++) { -		unsigned int msr; - -		msr = MSR_IA32_MC0_STATUS+i*4; -		rdmsr(msr, low, high); -		if (high & (1<<31)) { -			/* Clear it: */ -			wrmsr(msr, 0UL, 0UL); -			/* Serialize: */ -			wmb(); -			add_taint(TAINT_MACHINE_CHECK); -		} -	} -	mcgstl &= ~(1<<2); -	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ -	u32 l, h; -	int i; - -	/* Check for MCE support */ -	if (!cpu_has(c, X86_FEATURE_MCE)) -		return; - -	/* Check for PPro style MCA */ -	if (!cpu_has(c, X86_FEATURE_MCA)) -		return; - -	/* Ok machine check is available */ -	machine_check_vector = intel_machine_check; -	/* Make sure the vector pointer is 
visible before we enable MCEs: */ -	wmb(); - -	printk(KERN_INFO "Intel machine check architecture supported.\n"); -	rdmsr(MSR_IA32_MCG_CAP, l, h); -	if (l & (1<<8))	/* Control register present ? */ -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); -	nr_mce_banks = l & 0xff; - -	/* -	 * Following the example in IA-32 SDM Vol 3: -	 * - MC0_CTL should not be written -	 * - Status registers on all banks should be cleared on reset -	 */ -	for (i = 1; i < nr_mce_banks; i++) -		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - -	for (i = 0; i < nr_mce_banks; i++) -		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - -	set_in_cr4(X86_CR4_MCE); -	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", -		smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 5957a93e517..63a56d147e4 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -260,9 +260,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)  		return;  	} -	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) -		tm2 = 1; -  	/* Check whether a vector already exists */  	if (h & APIC_VECTOR_MASK) {  		printk(KERN_DEBUG @@ -271,6 +268,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)  		return;  	} +	/* early Pentium M models use different method for enabling TM2 */ +	if (cpu_has(c, X86_FEATURE_TM2)) { +		if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { +			rdmsr(MSR_THERM2_CTL, l, h); +			if (l & MSR_THERM2_CTL_TM_SELECT) +				tm2 = 1; +		} else if (l & MSR_IA32_MISC_ENABLE_TM2) +			tm2 = 1; +	} +  	/* We'll mask the thermal vector in the lapic till we're ready: */  	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;  	apic_write(APIC_LVTTHMR, h); diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 08b6ea4c62b..f04e7252760 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -126,8 +126,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)  		return -EINVAL;  	base = simple_strtoull(line + 5, &ptr, 0); -	for (; isspace(*ptr); ++ptr) -		; +	while (isspace(*ptr)) +		ptr++;  	if (strncmp(ptr, "size=", 5))  		return -EINVAL; @@ -135,14 +135,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)  	size = simple_strtoull(ptr + 5, &ptr, 0);  	if ((base & 0xfff) || (size & 0xfff))  		return -EINVAL; -	for (; isspace(*ptr); ++ptr) -		; +	while (isspace(*ptr)) +		ptr++;  	if (strncmp(ptr, "type=", 5))  		return -EINVAL;  	ptr += 5; -	for (; isspace(*ptr); ++ptr) -		; +	while (isspace(*ptr)) +		ptr++;  	for (i = 0; i < MTRR_NUM_TYPES; ++i) {  		if (strcmp(ptr, mtrr_strings[i])) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index 2732e2c1e4d..a3c7adb06b7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1,5 +1,5 @@  /* - * Performance counter x86 architecture code + * Performance events x86 architecture code   *   *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>   *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar @@ -11,7 +11,7 @@   *  For licencing details see kernel-base/COPYING   */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h>  #include <linux/capability.h>  #include <linux/notifier.h>  #include <linux/hardirq.h> @@ -27,19 +27,19 @@  #include <asm/stacktrace.h>  #include <asm/nmi.h> -static u64 perf_counter_mask __read_mostly; +static u64 
perf_event_mask __read_mostly; -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS	4 +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS	4  /* The size of a BTS record in bytes: */  #define BTS_RECORD_SIZE		24  /* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 1024) +#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)  /* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 64) +#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)  /* @@ -65,11 +65,11 @@ struct debug_store {  	u64	pebs_index;  	u64	pebs_absolute_maximum;  	u64	pebs_interrupt_threshold; -	u64	pebs_counter_reset[MAX_PEBS_COUNTERS]; +	u64	pebs_event_reset[MAX_PEBS_EVENTS];  }; -struct cpu_hw_counters { -	struct perf_counter	*counters[X86_PMC_IDX_MAX]; +struct cpu_hw_events { +	struct perf_event	*events[X86_PMC_IDX_MAX];  	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];  	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];  	unsigned long		interrupts; @@ -86,17 +86,17 @@ struct x86_pmu {  	int		(*handle_irq)(struct pt_regs *);  	void		(*disable_all)(void);  	void		(*enable_all)(void); -	void		(*enable)(struct hw_perf_counter *, int); -	void		(*disable)(struct hw_perf_counter *, int); +	void		(*enable)(struct hw_perf_event *, int); +	void		(*disable)(struct hw_perf_event *, int);  	unsigned	eventsel;  	unsigned	perfctr;  	u64		(*event_map)(int);  	u64		(*raw_event)(u64);  	int		max_events; -	int		num_counters; -	int		num_counters_fixed; -	int		counter_bits; -	u64		counter_mask; +	int		num_events; +	int		num_events_fixed; +	int		event_bits; +	u64		event_mask;  	int		apic;  	u64		max_period;  	u64		intel_ctrl; @@ -106,7 +106,7 @@ struct x86_pmu {  static struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {  	.enabled = 1,  }; @@ -124,35 +124,35 @@ static const u64 p6_perfmon_event_map[] =    [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,  }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event)  { -	return p6_perfmon_event_map[event]; +	return p6_perfmon_event_map[hw_event];  }  /* - * Counter setting that is specified not to count anything. + * Event setting that is specified not to count anything.   * We use this to effectively disable a counter.   *   * L2_RQSTS with 0 MESI unit mask.   
*/ -#define P6_NOP_COUNTER			0x0000002EULL +#define P6_NOP_EVENT			0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event)  {  #define P6_EVNTSEL_EVENT_MASK		0x000000FFULL  #define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL  #define P6_EVNTSEL_EDGE_MASK		0x00040000ULL  #define P6_EVNTSEL_INV_MASK		0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK		0xFF000000ULL +#define P6_EVNTSEL_REG_MASK		0xFF000000ULL  #define P6_EVNTSEL_MASK			\  	(P6_EVNTSEL_EVENT_MASK |	\  	 P6_EVNTSEL_UNIT_MASK  |	\  	 P6_EVNTSEL_EDGE_MASK  |	\  	 P6_EVNTSEL_INV_MASK   |	\ -	 P6_EVNTSEL_COUNTER_MASK) +	 P6_EVNTSEL_REG_MASK) -	return event & P6_EVNTSEL_MASK; +	return hw_event & P6_EVNTSEL_MASK;  } @@ -170,16 +170,16 @@ static const u64 intel_perfmon_event_map[] =    [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,  }; -static u64 intel_pmu_event_map(int event) +static u64 intel_pmu_event_map(int hw_event)  { -	return intel_perfmon_event_map[event]; +	return intel_perfmon_event_map[hw_event];  }  /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled   * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event   * ID.   */ @@ -463,22 +463,22 @@ static const u64 atom_hw_cache_event_ids   },  }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event)  {  #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL  #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL  #define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL  #define CORE_EVNTSEL_INV_MASK		0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK	0xFF000000ULL  #define CORE_EVNTSEL_MASK		\  	(CORE_EVNTSEL_EVENT_MASK |	\  	 CORE_EVNTSEL_UNIT_MASK  |	\  	 CORE_EVNTSEL_EDGE_MASK  |	\  	 CORE_EVNTSEL_INV_MASK  |	\ -	 CORE_EVNTSEL_COUNTER_MASK) +	 CORE_EVNTSEL_REG_MASK) -	return event & CORE_EVNTSEL_MASK; +	return hw_event & CORE_EVNTSEL_MASK;  }  static const u64 amd_hw_cache_event_ids @@ -585,39 +585,39 @@ static const u64 amd_perfmon_event_map[] =    [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,  }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event)  { -	return amd_perfmon_event_map[event]; +	return amd_perfmon_event_map[hw_event];  } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event)  {  #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL  #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL  #define K7_EVNTSEL_EDGE_MASK	0x000040000ULL  #define K7_EVNTSEL_INV_MASK	0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL +#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL  #define K7_EVNTSEL_MASK			\  	(K7_EVNTSEL_EVENT_MASK |	\  	 K7_EVNTSEL_UNIT_MASK  |	\  	 K7_EVNTSEL_EDGE_MASK  |	\  	 K7_EVNTSEL_INV_MASK   |	\ -	 K7_EVNTSEL_COUNTER_MASK) +	 K7_EVNTSEL_REG_MASK) -	return event & K7_EVNTSEL_MASK; +	return hw_event & K7_EVNTSEL_MASK;  }  /* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active.   * Returns the delta events processed.   
*/  static u64 -x86_perf_counter_update(struct perf_counter *counter, -			struct hw_perf_counter *hwc, int idx) +x86_perf_event_update(struct perf_event *event, +			struct hw_perf_event *hwc, int idx)  { -	int shift = 64 - x86_pmu.counter_bits; +	int shift = 64 - x86_pmu.event_bits;  	u64 prev_raw_count, new_raw_count;  	s64 delta; @@ -625,15 +625,15 @@ x86_perf_counter_update(struct perf_counter *counter,  		return 0;  	/* -	 * Careful: an NMI might modify the previous counter value. +	 * Careful: an NMI might modify the previous event value.  	 *  	 * Our tactic to handle this is to first atomically read and  	 * exchange a new raw count - then add that new-prev delta -	 * count to the generic counter atomically: +	 * count to the generic event atomically:  	 */  again:  	prev_raw_count = atomic64_read(&hwc->prev_count); -	rdmsrl(hwc->counter_base + idx, new_raw_count); +	rdmsrl(hwc->event_base + idx, new_raw_count);  	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,  					new_raw_count) != prev_raw_count) @@ -642,7 +642,7 @@ again:  	/*  	 * Now we have the new raw value and have updated the prev  	 * timestamp already. We can now calculate the elapsed delta -	 * (counter-)time and add that to the generic counter. +	 * (event-)time and add that to the generic event.  	 *  	 * Careful, not all hw sign-extends above the physical width  	 * of the count. @@ -650,13 +650,13 @@ again:  	delta = (new_raw_count << shift) - (prev_raw_count << shift);  	delta >>= shift; -	atomic64_add(delta, &counter->count); +	atomic64_add(delta, &event->count);  	atomic64_sub(delta, &hwc->period_left);  	return new_raw_count;  } -static atomic_t active_counters; +static atomic_t active_events;  static DEFINE_MUTEX(pmc_reserve_mutex);  static bool reserve_pmc_hardware(void) @@ -667,12 +667,12 @@ static bool reserve_pmc_hardware(void)  	if (nmi_watchdog == NMI_LOCAL_APIC)  		disable_lapic_nmi_watchdog(); -	for (i = 0; i < x86_pmu.num_counters; i++) { +	for (i = 0; i < x86_pmu.num_events; i++) {  		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))  			goto perfctr_fail;  	} -	for (i = 0; i < x86_pmu.num_counters; i++) { +	for (i = 0; i < x86_pmu.num_events; i++) {  		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))  			goto eventsel_fail;  	} @@ -685,7 +685,7 @@ eventsel_fail:  	for (i--; i >= 0; i--)  		release_evntsel_nmi(x86_pmu.eventsel + i); -	i = x86_pmu.num_counters; +	i = x86_pmu.num_events;  perfctr_fail:  	for (i--; i >= 0; i--) @@ -703,7 +703,7 @@ static void release_pmc_hardware(void)  #ifdef CONFIG_X86_LOCAL_APIC  	int i; -	for (i = 0; i < x86_pmu.num_counters; i++) { +	for (i = 0; i < x86_pmu.num_events; i++) {  		release_perfctr_nmi(x86_pmu.perfctr + i);  		release_evntsel_nmi(x86_pmu.eventsel + i);  	} @@ -720,7 +720,7 @@ static inline bool bts_available(void)  static inline void init_debug_store_on_cpu(int cpu)  { -	struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;  	if (!ds)  		return; @@ -732,7 +732,7 @@ static inline void init_debug_store_on_cpu(int cpu)  static inline void fini_debug_store_on_cpu(int cpu)  { -	if (!per_cpu(cpu_hw_counters, cpu).ds) +	if (!per_cpu(cpu_hw_events, cpu).ds)  		return;  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); @@ -751,12 +751,12 @@ static void release_bts_hardware(void)  		fini_debug_store_on_cpu(cpu);  	for_each_possible_cpu(cpu) { -		struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; +		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;  		if (!ds)  			continue; -		
per_cpu(cpu_hw_counters, cpu).ds = NULL; +		per_cpu(cpu_hw_events, cpu).ds = NULL;  		kfree((void *)(unsigned long)ds->bts_buffer_base);  		kfree(ds); @@ -796,7 +796,7 @@ static int reserve_bts_hardware(void)  		ds->bts_interrupt_threshold =  			ds->bts_absolute_maximum - BTS_OVFL_TH; -		per_cpu(cpu_hw_counters, cpu).ds = ds; +		per_cpu(cpu_hw_events, cpu).ds = ds;  		err = 0;  	} @@ -812,9 +812,9 @@ static int reserve_bts_hardware(void)  	return err;  } -static void hw_perf_counter_destroy(struct perf_counter *counter) +static void hw_perf_event_destroy(struct perf_event *event)  { -	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { +	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {  		release_pmc_hardware();  		release_bts_hardware();  		mutex_unlock(&pmc_reserve_mutex); @@ -827,7 +827,7 @@ static inline int x86_pmu_initialized(void)  }  static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)  {  	unsigned int cache_type, cache_op, cache_result;  	u64 config, val; @@ -880,7 +880,7 @@ static void intel_pmu_enable_bts(u64 config)  static void intel_pmu_disable_bts(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	unsigned long debugctlmsr;  	if (!cpuc->ds) @@ -898,10 +898,10 @@ static void intel_pmu_disable_bts(void)  /*   * Setup the hardware configuration for a given attr_type   */ -static int __hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_event_init(struct perf_event *event)  { -	struct perf_counter_attr *attr = &counter->attr; -	struct hw_perf_counter *hwc = &counter->hw; +	struct perf_event_attr *attr = &event->attr; +	struct hw_perf_event *hwc = &event->hw;  	u64 config;  	int err; @@ -909,21 +909,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)  		return -ENODEV;  	err = 0; -	if (!atomic_inc_not_zero(&active_counters)) { +	if (!atomic_inc_not_zero(&active_events)) {  		mutex_lock(&pmc_reserve_mutex); -		if (atomic_read(&active_counters) == 0) { +		if (atomic_read(&active_events) == 0) {  			if (!reserve_pmc_hardware())  				err = -EBUSY;  			else  				err = reserve_bts_hardware();  		}  		if (!err) -			atomic_inc(&active_counters); +			atomic_inc(&active_events);  		mutex_unlock(&pmc_reserve_mutex);  	}  	if (err)  		return err; +	event->destroy = hw_perf_event_destroy; +  	/*  	 * Generate PMC IRQs:  	 * (keep 'enabled' bit clear for now) @@ -946,17 +948,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)  		/*  		 * If we have a PMU initialized but no APIC  		 * interrupts, we cannot sample hardware -		 * counters (user-space has to fall back and -		 * sample via a hrtimer based software counter): +		 * events (user-space has to fall back and +		 * sample via a hrtimer based software event):  		 */  		if (!x86_pmu.apic)  			return -EOPNOTSUPP;  	} -	counter->destroy = hw_perf_counter_destroy; -  	/* -	 * Raw event type provide the config in the event structure +	 * Raw hw_event type provide the config in the hw_event structure  	 */  	if (attr->type == PERF_TYPE_RAW) {  		hwc->config |= x86_pmu.raw_event(attr->config); @@ -1001,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)  static void p6_pmu_disable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	u64 
val;  	if (!cpuc->enabled) @@ -1018,7 +1018,7 @@ static void p6_pmu_disable_all(void)  static void intel_pmu_disable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	if (!cpuc->enabled)  		return; @@ -1034,7 +1034,7 @@ static void intel_pmu_disable_all(void)  static void amd_pmu_disable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	int idx;  	if (!cpuc->enabled) @@ -1043,12 +1043,12 @@ static void amd_pmu_disable_all(void)  	cpuc->enabled = 0;  	/*  	 * ensure we write the disable before we start disabling the -	 * counters proper, so that amd_pmu_enable_counter() does the +	 * events proper, so that amd_pmu_enable_event() does the  	 * right thing.  	 */  	barrier(); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +	for (idx = 0; idx < x86_pmu.num_events; idx++) {  		u64 val;  		if (!test_bit(idx, cpuc->active_mask)) @@ -1070,7 +1070,7 @@ void hw_perf_disable(void)  static void p6_pmu_enable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	unsigned long val;  	if (cpuc->enabled) @@ -1087,7 +1087,7 @@ static void p6_pmu_enable_all(void)  static void intel_pmu_enable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	if (cpuc->enabled)  		return; @@ -1098,19 +1098,19 @@ static void intel_pmu_enable_all(void)  	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);  	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { -		struct perf_counter *counter = -			cpuc->counters[X86_PMC_IDX_FIXED_BTS]; +		struct perf_event *event = +			cpuc->events[X86_PMC_IDX_FIXED_BTS]; -		if (WARN_ON_ONCE(!counter)) +		if (WARN_ON_ONCE(!event))  			return; -		intel_pmu_enable_bts(counter->hw.config); +		intel_pmu_enable_bts(event->hw.config);  	}  }  static void amd_pmu_enable_all(void)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	int idx;  	if (cpuc->enabled) @@ -1119,14 +1119,14 @@ static void amd_pmu_enable_all(void)  	cpuc->enabled = 1;  	barrier(); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { -		struct perf_counter *counter = cpuc->counters[idx]; +	for (idx = 0; idx < x86_pmu.num_events; idx++) { +		struct perf_event *event = cpuc->events[idx];  		u64 val;  		if (!test_bit(idx, cpuc->active_mask))  			continue; -		val = counter->hw.config; +		val = event->hw.config;  		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;  		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);  	} @@ -1153,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack)  	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);  } -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)  {  	(void)checking_wrmsrl(hwc->config_base + idx,  			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);  } -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)  {  	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);  }  static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)  {  	int idx = __idx - 
X86_PMC_IDX_FIXED;  	u64 ctrl_val, mask; @@ -1178,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)  }  static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); -	u64 val = P6_NOP_COUNTER; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	u64 val = P6_NOP_EVENT;  	if (cpuc->enabled)  		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1190,7 +1190,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)  }  static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)  {  	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {  		intel_pmu_disable_bts(); @@ -1202,24 +1202,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)  		return;  	} -	x86_pmu_disable_counter(hwc, idx); +	x86_pmu_disable_event(hwc, idx);  }  static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)  { -	x86_pmu_disable_counter(hwc, idx); +	x86_pmu_disable_event(hwc, idx);  }  static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);  /*   * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: + * To be called with the event disabled in hw:   */  static int -x86_perf_counter_set_period(struct perf_counter *counter, -			     struct hw_perf_counter *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event, +			     struct hw_perf_event *hwc, int idx)  {  	s64 left = atomic64_read(&hwc->period_left);  	s64 period = hwc->sample_period; @@ -1245,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,  		ret = 1;  	}  	/* -	 * Quirk: certain CPUs dont like it if just 1 event is left: +	 * Quirk: certain CPUs dont like it if just 1 hw_event is left:  	 */  	if (unlikely(left < 2))  		left = 2; @@ -1256,21 +1256,21 @@ x86_perf_counter_set_period(struct perf_counter *counter,  	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;  	/* -	 * The hw counter starts counting from this counter offset, +	 * The hw event starts counting from this event offset,  	 * mark it to be able to extra future deltas:  	 */  	atomic64_set(&hwc->prev_count, (u64)-left); -	err = checking_wrmsrl(hwc->counter_base + idx, -			     (u64)(-left) & x86_pmu.counter_mask); +	err = checking_wrmsrl(hwc->event_base + idx, +			     (u64)(-left) & x86_pmu.event_mask); -	perf_counter_update_userpage(counter); +	perf_event_update_userpage(event);  	return ret;  }  static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)  {  	int idx = __idx - X86_PMC_IDX_FIXED;  	u64 ctrl_val, bits, mask; @@ -1295,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)  	err = checking_wrmsrl(hwc->config_base, ctrl_val);  } -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	u64 val;  	val = hwc->config; @@ -1308,10 +1308,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)  } -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void 
intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)  {  	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { -		if (!__get_cpu_var(cpu_hw_counters).enabled) +		if (!__get_cpu_var(cpu_hw_events).enabled)  			return;  		intel_pmu_enable_bts(hwc->config); @@ -1323,134 +1323,134 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)  		return;  	} -	x86_pmu_enable_counter(hwc, idx); +	x86_pmu_enable_event(hwc, idx);  } -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	if (cpuc->enabled) -		x86_pmu_enable_counter(hwc, idx); +		x86_pmu_enable_event(hwc, idx);  }  static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)  { -	unsigned int event; +	unsigned int hw_event; -	event = hwc->config & ARCH_PERFMON_EVENT_MASK; +	hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; -	if (unlikely((event == +	if (unlikely((hw_event ==  		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&  		     (hwc->sample_period == 1)))  		return X86_PMC_IDX_FIXED_BTS; -	if (!x86_pmu.num_counters_fixed) +	if (!x86_pmu.num_events_fixed)  		return -1; -	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) +	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))  		return X86_PMC_IDX_FIXED_INSTRUCTIONS; -	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) +	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))  		return X86_PMC_IDX_FIXED_CPU_CYCLES; -	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) +	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))  		return X86_PMC_IDX_FIXED_BUS_CYCLES;  	return -1;  }  /* - * Find a PMC slot for the freshly enabled / scheduled in counter: + * Find a PMC slot for the freshly enabled / scheduled in event:   */ -static int x86_pmu_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_event *event)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); -	struct hw_perf_counter *hwc = &counter->hw; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	struct hw_perf_event *hwc = &event->hw;  	int idx; -	idx = fixed_mode_idx(counter, hwc); +	idx = fixed_mode_idx(event, hwc);  	if (idx == X86_PMC_IDX_FIXED_BTS) {  		/* BTS is already occupied. */  		if (test_and_set_bit(idx, cpuc->used_mask))  			return -EAGAIN;  		hwc->config_base	= 0; -		hwc->counter_base	= 0; +		hwc->event_base	= 0;  		hwc->idx		= idx;  	} else if (idx >= 0) {  		/* -		 * Try to get the fixed counter, if that is already taken -		 * then try to get a generic counter: +		 * Try to get the fixed event, if that is already taken +		 * then try to get a generic event:  		 */  		if (test_and_set_bit(idx, cpuc->used_mask))  			goto try_generic;  		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;  		/* -		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to +		 * We set it so that event_base + idx in wrmsr/rdmsr maps to  		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2:  		 */ -		hwc->counter_base = +		hwc->event_base =  			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;  		hwc->idx = idx;  	} else {  		idx = hwc->idx; -		/* Try to get the previous generic counter again */ +		/* Try to get the previous generic event again */  		if (test_and_set_bit(idx, cpuc->used_mask)) {  try_generic:  			idx = find_first_zero_bit(cpuc->used_mask, -						  x86_pmu.num_counters); -			if (idx == x86_pmu.num_counters) +						  x86_pmu.num_events); +			if (idx == x86_pmu.num_events)  				return -EAGAIN;  			set_bit(idx, cpuc->used_mask);  			hwc->idx = idx;  		}  		hwc->config_base  = x86_pmu.eventsel; -		hwc->counter_base = x86_pmu.perfctr; +		hwc->event_base = x86_pmu.perfctr;  	} -	perf_counters_lapic_init(); +	perf_events_lapic_init();  	x86_pmu.disable(hwc, idx); -	cpuc->counters[idx] = counter; +	cpuc->events[idx] = event;  	set_bit(idx, cpuc->active_mask); -	x86_perf_counter_set_period(counter, hwc, idx); +	x86_perf_event_set_period(event, hwc, idx);  	x86_pmu.enable(hwc, idx); -	perf_counter_update_userpage(counter); +	perf_event_update_userpage(event);  	return 0;  } -static void x86_pmu_unthrottle(struct perf_counter *counter) +static void x86_pmu_unthrottle(struct perf_event *event)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); -	struct hw_perf_counter *hwc = &counter->hw; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	struct hw_perf_event *hwc = &event->hw;  	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || -				cpuc->counters[hwc->idx] != counter)) +				cpuc->events[hwc->idx] != event))  		return;  	x86_pmu.enable(hwc, hwc->idx);  } -void perf_counter_print_debug(void) +void perf_event_print_debug(void)  {  	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; -	struct cpu_hw_counters *cpuc; +	struct cpu_hw_events *cpuc;  	unsigned long flags;  	int cpu, idx; -	if (!x86_pmu.num_counters) +	if (!x86_pmu.num_events)  		return;  	local_irq_save(flags);  	cpu = smp_processor_id(); -	cpuc = &per_cpu(cpu_hw_counters, cpu); +	cpuc = &per_cpu(cpu_hw_events, cpu);  	if (x86_pmu.version >= 2) {  		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); @@ -1466,7 +1466,7 @@ void perf_counter_print_debug(void)  	}  	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +	for (idx = 0; idx < x86_pmu.num_events; idx++) {  		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);  		rdmsrl(x86_pmu.perfctr  + idx, pmc_count); @@ -1479,7 +1479,7 @@ void perf_counter_print_debug(void)  		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",  			cpu, idx, prev_left);  	} -	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {  		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);  		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void)  	local_irq_restore(flags);  } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, -				       struct perf_sample_data *data) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)  {  	struct debug_store *ds = cpuc->ds;  	struct bts_record { @@ -1497,11 +1496,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,  		u64	to;  		u64	flags;  	}; -	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; -	unsigned long orig_ip = data->regs->ip; +	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];  	struct bts_record *at, *top; +	struct perf_output_handle handle; +	
struct perf_event_header header; +	struct perf_sample_data data; +	struct pt_regs regs; -	if (!counter) +	if (!event)  		return;  	if (!ds) @@ -1510,26 +1512,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,  	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;  	top = (struct bts_record *)(unsigned long)ds->bts_index; +	if (top <= at) +		return; +  	ds->bts_index = ds->bts_buffer_base; + +	data.period	= event->hw.last_period; +	data.addr	= 0; +	regs.ip		= 0; + +	/* +	 * Prepare a generic sample, i.e. fill in the invariant fields. +	 * We will overwrite the from and to address before we output +	 * the sample. +	 */ +	perf_prepare_sample(&header, &data, event, ®s); + +	if (perf_output_begin(&handle, event, +			      header.size * (top - at), 1, 1)) +		return; +  	for (; at < top; at++) { -		data->regs->ip	= at->from; -		data->addr	= at->to; +		data.ip		= at->from; +		data.addr	= at->to; -		perf_counter_output(counter, 1, data); +		perf_output_sample(&handle, &header, &data, event);  	} -	data->regs->ip	= orig_ip; -	data->addr	= 0; +	perf_output_end(&handle);  	/* There's new data available. */ -	counter->pending_kill = POLL_IN; +	event->hw.interrupts++; +	event->pending_kill = POLL_IN;  } -static void x86_pmu_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_event *event)  { -	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); -	struct hw_perf_counter *hwc = &counter->hw; +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); +	struct hw_perf_event *hwc = &event->hw;  	int idx = hwc->idx;  	/* @@ -1541,67 +1562,63 @@ static void x86_pmu_disable(struct perf_counter *counter)  	/*  	 * Make sure the cleared pointer becomes visible before we -	 * (potentially) free the counter: +	 * (potentially) free the event:  	 */  	barrier();  	/* -	 * Drain the remaining delta count out of a counter +	 * Drain the remaining delta count out of a event  	 * that we are disabling:  	 */ -	x86_perf_counter_update(counter, hwc, idx); +	x86_perf_event_update(event, hwc, idx);  	/* Drain the remaining BTS records. */ -	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { -		struct perf_sample_data data; -		struct pt_regs regs; +	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) +		intel_pmu_drain_bts_buffer(cpuc); -		data.regs = ®s; -		intel_pmu_drain_bts_buffer(cpuc, &data); -	} -	cpuc->counters[idx] = NULL; +	cpuc->events[idx] = NULL;  	clear_bit(idx, cpuc->used_mask); -	perf_counter_update_userpage(counter); +	perf_event_update_userpage(event);  }  /* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: + * Save and restart an expired event. 
Called by NMI contexts, + * so it has to be careful about preempting normal event ops:   */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_event *event)  { -	struct hw_perf_counter *hwc = &counter->hw; +	struct hw_perf_event *hwc = &event->hw;  	int idx = hwc->idx;  	int ret; -	x86_perf_counter_update(counter, hwc, idx); -	ret = x86_perf_counter_set_period(counter, hwc, idx); +	x86_perf_event_update(event, hwc, idx); +	ret = x86_perf_event_set_period(event, hwc, idx); -	if (counter->state == PERF_COUNTER_STATE_ACTIVE) -		intel_pmu_enable_counter(hwc, idx); +	if (event->state == PERF_EVENT_STATE_ACTIVE) +		intel_pmu_enable_event(hwc, idx);  	return ret;  }  static void intel_pmu_reset(void)  { -	struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; +	struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;  	unsigned long flags;  	int idx; -	if (!x86_pmu.num_counters) +	if (!x86_pmu.num_events)  		return;  	local_irq_save(flags);  	printk("clearing PMU state on CPU#%d\n", smp_processor_id()); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +	for (idx = 0; idx < x86_pmu.num_events; idx++) {  		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);  		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);  	} -	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {  		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);  	}  	if (ds) @@ -1613,39 +1630,38 @@ static void intel_pmu_reset(void)  static int p6_pmu_handle_irq(struct pt_regs *regs)  {  	struct perf_sample_data data; -	struct cpu_hw_counters *cpuc; -	struct perf_counter *counter; -	struct hw_perf_counter *hwc; +	struct cpu_hw_events *cpuc; +	struct perf_event *event; +	struct hw_perf_event *hwc;  	int idx, handled = 0;  	u64 val; -	data.regs = regs;  	data.addr = 0; -	cpuc = &__get_cpu_var(cpu_hw_counters); +	cpuc = &__get_cpu_var(cpu_hw_events); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +	for (idx = 0; idx < x86_pmu.num_events; idx++) {  		if (!test_bit(idx, cpuc->active_mask))  			continue; -		counter = cpuc->counters[idx]; -		hwc = &counter->hw; +		event = cpuc->events[idx]; +		hwc = &event->hw; -		val = x86_perf_counter_update(counter, hwc, idx); -		if (val & (1ULL << (x86_pmu.counter_bits - 1))) +		val = x86_perf_event_update(event, hwc, idx); +		if (val & (1ULL << (x86_pmu.event_bits - 1)))  			continue;  		/* -		 * counter overflow +		 * event overflow  		 */  		handled		= 1; -		data.period	= counter->hw.last_period; +		data.period	= event->hw.last_period; -		if (!x86_perf_counter_set_period(counter, hwc, idx)) +		if (!x86_perf_event_set_period(event, hwc, idx))  			continue; -		if (perf_counter_overflow(counter, 1, &data)) -			p6_pmu_disable_counter(hwc, idx); +		if (perf_event_overflow(event, 1, &data, regs)) +			p6_pmu_disable_event(hwc, idx);  	}  	if (handled) @@ -1661,17 +1677,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)  static int intel_pmu_handle_irq(struct pt_regs *regs)  {  	struct perf_sample_data data; -	struct cpu_hw_counters *cpuc; +	struct cpu_hw_events *cpuc;  	int bit, loops;  	u64 ack, status; -	data.regs = regs;  	data.addr = 0; -	cpuc = &__get_cpu_var(cpu_hw_counters); +	cpuc = &__get_cpu_var(cpu_hw_events);  	perf_disable(); -	intel_pmu_drain_bts_buffer(cpuc, &data); +	intel_pmu_drain_bts_buffer(cpuc);  	status = intel_pmu_get_status();  	if (!status) {  		perf_enable(); @@ -1681,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	
loops = 0;  again:  	if (++loops > 100) { -		WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); -		perf_counter_print_debug(); +		WARN_ONCE(1, "perfevents: irq loop stuck!\n"); +		perf_event_print_debug();  		intel_pmu_reset();  		perf_enable();  		return 1; @@ -1691,19 +1706,19 @@ again:  	inc_irq_stat(apic_perf_irqs);  	ack = status;  	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { -		struct perf_counter *counter = cpuc->counters[bit]; +		struct perf_event *event = cpuc->events[bit];  		clear_bit(bit, (unsigned long *) &status);  		if (!test_bit(bit, cpuc->active_mask))  			continue; -		if (!intel_pmu_save_and_restart(counter)) +		if (!intel_pmu_save_and_restart(event))  			continue; -		data.period = counter->hw.last_period; +		data.period = event->hw.last_period; -		if (perf_counter_overflow(counter, 1, &data)) -			intel_pmu_disable_counter(&counter->hw, bit); +		if (perf_event_overflow(event, 1, &data, regs)) +			intel_pmu_disable_event(&event->hw, bit);  	}  	intel_pmu_ack_status(ack); @@ -1723,39 +1738,38 @@ again:  static int amd_pmu_handle_irq(struct pt_regs *regs)  {  	struct perf_sample_data data; -	struct cpu_hw_counters *cpuc; -	struct perf_counter *counter; -	struct hw_perf_counter *hwc; +	struct cpu_hw_events *cpuc; +	struct perf_event *event; +	struct hw_perf_event *hwc;  	int idx, handled = 0;  	u64 val; -	data.regs = regs;  	data.addr = 0; -	cpuc = &__get_cpu_var(cpu_hw_counters); +	cpuc = &__get_cpu_var(cpu_hw_events); -	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +	for (idx = 0; idx < x86_pmu.num_events; idx++) {  		if (!test_bit(idx, cpuc->active_mask))  			continue; -		counter = cpuc->counters[idx]; -		hwc = &counter->hw; +		event = cpuc->events[idx]; +		hwc = &event->hw; -		val = x86_perf_counter_update(counter, hwc, idx); -		if (val & (1ULL << (x86_pmu.counter_bits - 1))) +		val = x86_perf_event_update(event, hwc, idx); +		if (val & (1ULL << (x86_pmu.event_bits - 1)))  			continue;  		/* -		 * counter overflow +		 * event overflow  		 */  		handled		= 1; -		data.period	= counter->hw.last_period; +		data.period	= event->hw.last_period; -		if (!x86_perf_counter_set_period(counter, hwc, idx)) +		if (!x86_perf_event_set_period(event, hwc, idx))  			continue; -		if (perf_counter_overflow(counter, 1, &data)) -			amd_pmu_disable_counter(hwc, idx); +		if (perf_event_overflow(event, 1, &data, regs)) +			amd_pmu_disable_event(hwc, idx);  	}  	if (handled) @@ -1769,18 +1783,18 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)  	irq_enter();  	ack_APIC_irq();  	inc_irq_stat(apic_pending_irqs); -	perf_counter_do_pending(); +	perf_event_do_pending();  	irq_exit();  } -void set_perf_counter_pending(void) +void set_perf_event_pending(void)  {  #ifdef CONFIG_X86_LOCAL_APIC  	apic->send_IPI_self(LOCAL_PENDING_VECTOR);  #endif  } -void perf_counters_lapic_init(void) +void perf_events_lapic_init(void)  {  #ifdef CONFIG_X86_LOCAL_APIC  	if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1794,13 +1808,13 @@ void perf_counters_lapic_init(void)  }  static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, +perf_event_nmi_handler(struct notifier_block *self,  			 unsigned long cmd, void *__args)  {  	struct die_args *args = __args;  	struct pt_regs *regs; -	if (!atomic_read(&active_counters)) +	if (!atomic_read(&active_events))  		return NOTIFY_DONE;  	switch (cmd) { @@ -1819,7 +1833,7 @@ perf_counter_nmi_handler(struct notifier_block *self,  #endif  	/*  	 * Can't rely on the handled return value to say it was our NMI, two -	 * counters could 
trigger 'simultaneously' raising two back-to-back NMIs. +	 * events could trigger 'simultaneously' raising two back-to-back NMIs.  	 *  	 * If the first NMI handles both, the latter will be empty and daze  	 * the CPU. @@ -1829,8 +1843,8 @@ perf_counter_nmi_handler(struct notifier_block *self,  	return NOTIFY_STOP;  } -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { -	.notifier_call		= perf_counter_nmi_handler, +static __read_mostly struct notifier_block perf_event_nmi_notifier = { +	.notifier_call		= perf_event_nmi_handler,  	.next			= NULL,  	.priority		= 1  }; @@ -1840,8 +1854,8 @@ static struct x86_pmu p6_pmu = {  	.handle_irq		= p6_pmu_handle_irq,  	.disable_all		= p6_pmu_disable_all,  	.enable_all		= p6_pmu_enable_all, -	.enable			= p6_pmu_enable_counter, -	.disable		= p6_pmu_disable_counter, +	.enable			= p6_pmu_enable_event, +	.disable		= p6_pmu_disable_event,  	.eventsel		= MSR_P6_EVNTSEL0,  	.perfctr		= MSR_P6_PERFCTR0,  	.event_map		= p6_pmu_event_map, @@ -1850,16 +1864,16 @@ static struct x86_pmu p6_pmu = {  	.apic			= 1,  	.max_period		= (1ULL << 31) - 1,  	.version		= 0, -	.num_counters		= 2, +	.num_events		= 2,  	/* -	 * Counters have 40 bits implemented. However they are designed such +	 * Events have 40 bits implemented. However they are designed such  	 * that bits [32-39] are sign extensions of bit 31. As such the -	 * effective width of a counter for P6-like PMU is 32 bits only. +	 * effective width of a event for P6-like PMU is 32 bits only.  	 *  	 * See IA-32 Intel Architecture Software developer manual Vol 3B  	 */ -	.counter_bits		= 32, -	.counter_mask		= (1ULL << 32) - 1, +	.event_bits		= 32, +	.event_mask		= (1ULL << 32) - 1,  };  static struct x86_pmu intel_pmu = { @@ -1867,8 +1881,8 @@ static struct x86_pmu intel_pmu = {  	.handle_irq		= intel_pmu_handle_irq,  	.disable_all		= intel_pmu_disable_all,  	.enable_all		= intel_pmu_enable_all, -	.enable			= intel_pmu_enable_counter, -	.disable		= intel_pmu_disable_counter, +	.enable			= intel_pmu_enable_event, +	.disable		= intel_pmu_disable_event,  	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,  	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,  	.event_map		= intel_pmu_event_map, @@ -1878,7 +1892,7 @@ static struct x86_pmu intel_pmu = {  	/*  	 * Intel PMCs cannot be accessed sanely above 32 bit width,  	 * so we install an artificial 1<<31 period regardless of -	 * the generic counter period: +	 * the generic event period:  	 */  	.max_period		= (1ULL << 31) - 1,  	.enable_bts		= intel_pmu_enable_bts, @@ -1890,16 +1904,16 @@ static struct x86_pmu amd_pmu = {  	.handle_irq		= amd_pmu_handle_irq,  	.disable_all		= amd_pmu_disable_all,  	.enable_all		= amd_pmu_enable_all, -	.enable			= amd_pmu_enable_counter, -	.disable		= amd_pmu_disable_counter, +	.enable			= amd_pmu_enable_event, +	.disable		= amd_pmu_disable_event,  	.eventsel		= MSR_K7_EVNTSEL0,  	.perfctr		= MSR_K7_PERFCTR0,  	.event_map		= amd_pmu_event_map,  	.raw_event		= amd_pmu_raw_event,  	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), -	.num_counters		= 4, -	.counter_bits		= 48, -	.counter_mask		= (1ULL << 48) - 1, +	.num_events		= 4, +	.event_bits		= 48, +	.event_mask		= (1ULL << 48) - 1,  	.apic			= 1,  	/* use highest bit to detect overflow */  	.max_period		= (1ULL << 47) - 1, @@ -1956,7 +1970,7 @@ static int intel_pmu_init(void)  	/*  	 * Check whether the Architectural PerfMon supports -	 * Branch Misses Retired Event or not. +	 * Branch Misses Retired hw_event or not.  	 
*/  	cpuid(10, &eax.full, &ebx, &unused, &edx.full);  	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) @@ -1968,15 +1982,15 @@ static int intel_pmu_init(void)  	x86_pmu				= intel_pmu;  	x86_pmu.version			= version; -	x86_pmu.num_counters		= eax.split.num_counters; -	x86_pmu.counter_bits		= eax.split.bit_width; -	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1; +	x86_pmu.num_events		= eax.split.num_events; +	x86_pmu.event_bits		= eax.split.bit_width; +	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;  	/* -	 * Quirk: v2 perfmon does not report fixed-purpose counters, so -	 * assume at least 3 counters: +	 * Quirk: v2 perfmon does not report fixed-purpose events, so +	 * assume at least 3 events:  	 */ -	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3); +	x86_pmu.num_events_fixed	= max((int)edx.split.num_events_fixed, 3);  	/*  	 * Install the hw-cache-events table: @@ -2023,11 +2037,11 @@ static int amd_pmu_init(void)  	return 0;  } -void __init init_hw_perf_counters(void) +void __init init_hw_perf_events(void)  {  	int err; -	pr_info("Performance Counters: "); +	pr_info("Performance Events: ");  	switch (boot_cpu_data.x86_vendor) {  	case X86_VENDOR_INTEL: @@ -2040,45 +2054,45 @@ void __init init_hw_perf_counters(void)  		return;  	}  	if (err != 0) { -		pr_cont("no PMU driver, software counters only.\n"); +		pr_cont("no PMU driver, software events only.\n");  		return;  	}  	pr_cont("%s PMU driver.\n", x86_pmu.name); -	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { -		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", -		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC); -		x86_pmu.num_counters = X86_PMC_MAX_GENERIC; +	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { +		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", +		     x86_pmu.num_events, X86_PMC_MAX_GENERIC); +		x86_pmu.num_events = X86_PMC_MAX_GENERIC;  	} -	perf_counter_mask = (1 << x86_pmu.num_counters) - 1; -	perf_max_counters = x86_pmu.num_counters; +	perf_event_mask = (1 << x86_pmu.num_events) - 1; +	perf_max_events = x86_pmu.num_events; -	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { -		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", -		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); -		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; +	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { +		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", +		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); +		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;  	} -	perf_counter_mask |= -		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; -	x86_pmu.intel_ctrl = perf_counter_mask; +	perf_event_mask |= +		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; +	x86_pmu.intel_ctrl = perf_event_mask; -	perf_counters_lapic_init(); -	register_die_notifier(&perf_counter_nmi_notifier); +	perf_events_lapic_init(); +	register_die_notifier(&perf_event_nmi_notifier); -	pr_info("... version:                 %d\n",     x86_pmu.version); -	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits); -	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters); -	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask); -	pr_info("... max period:              %016Lx\n", x86_pmu.max_period); -	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed); -	pr_info("... counter mask:            %016Lx\n", perf_counter_mask); +	pr_info("... 
version:                %d\n",     x86_pmu.version); +	pr_info("... bit width:              %d\n",     x86_pmu.event_bits); +	pr_info("... generic registers:      %d\n",     x86_pmu.num_events); +	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask); +	pr_info("... max period:             %016Lx\n", x86_pmu.max_period); +	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed); +	pr_info("... event mask:             %016Lx\n", perf_event_mask);  } -static inline void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_event *event)  { -	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +	x86_perf_event_update(event, &event->hw, event->hw.idx);  }  static const struct pmu pmu = { @@ -2088,13 +2102,16 @@ static const struct pmu pmu = {  	.unthrottle	= x86_pmu_unthrottle,  }; -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_event_init(struct perf_event *event)  {  	int err; -	err = __hw_perf_counter_init(counter); -	if (err) +	err = __hw_perf_event_init(event); +	if (err) { +		if (event->destroy) +			event->destroy(event);  		return ERR_PTR(err); +	}  	return &pmu;  } @@ -2275,7 +2292,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)  	return entry;  } -void hw_perf_counter_setup_online(int cpu) +void hw_perf_event_setup_online(int cpu)  {  	init_debug_store_on_cpu(cpu);  } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b89..fab786f60ed 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@  #include <linux/kprobes.h>  #include <asm/apic.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h>  struct nmi_watchdog_ctlblk {  	unsigned int cccr_msr; diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 00000000000..a640ae5ad20 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); + +static unsigned long scale_aperfmperf(void) +{ +	struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); +	unsigned long ratio, flags; + +	local_irq_save(flags); +	get_aperfmperf(&val); +	local_irq_restore(flags); + +	ratio = calc_aperfmperf_ratio(old, &val); +	*old = val; + +	return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ +	/* +	 * do aperf/mperf on the cpu level because it includes things +	 * like turbo mode, which are relevant to full cores. 
+	 */ +	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) +		return scale_aperfmperf(); + +	/* +	 * maybe have something cpufreq here +	 */ + +	return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ +	/* +	 * aperf/mperf already includes the smt gain +	 */ +	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) +		return SCHED_LOAD_SCALE; + +	return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec9..1cbed97b59c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@  #include <linux/dmi.h>  #include <asm/div64.h>  #include <asm/vmware.h> +#include <asm/x86_init.h>  #define CPUID_VMWARE_INFO_LEAF	0x40000000  #define VMWARE_HYPERVISOR_MAGIC	0x564D5868 @@ -47,21 +48,35 @@ static inline int __vmware_platform(void)  	return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;  } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void)  {  	uint64_t tsc_hz;  	uint32_t eax, ebx, ecx, edx;  	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); -	if (ebx == UINT_MAX) -		return 0;  	tsc_hz = eax | (((uint64_t)ebx) << 32);  	do_div(tsc_hz, 1000);  	BUG_ON(tsc_hz >> 32); +	printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", +			 (unsigned long) tsc_hz / 1000, +			 (unsigned long) tsc_hz % 1000);  	return tsc_hz;  } +void __init vmware_platform_setup(void) +{ +	uint32_t eax, ebx, ecx, edx; + +	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + +	if (ebx != UINT_MAX) +		x86_platform.calibrate_tsc = vmware_get_tsc_khz; +	else +		printk(KERN_WARNING +		       "Failed to get TSC freq from the hypervisor\n"); +} +  /*   * While checking the dmi string infomation, just checking the product   * serial key should be enough, as this will always have a VMware @@ -87,12 +102,6 @@ int vmware_platform(void)  	return 0;  } -unsigned long vmware_get_tsc_khz(void) -{ -	BUG_ON(!vmware_platform()); -	return __vmware_get_tsc_khz(); -} -  /*   * VMware hypervisor takes care of exporting a reliable TSC to the guest.   
* Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af886124..6a52d4b36a3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =  	.notifier_call = cpuid_class_cpu_callback,  }; -static char *cpuid_nodename(struct device *dev) +static char *cpuid_devnode(struct device *dev, mode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));  } @@ -203,7 +203,7 @@ static int __init cpuid_init(void)  		err = PTR_ERR(cpuid_class);  		goto out_chrdev;  	} -	cpuid_class->nodename = cpuid_nodename; +	cpuid_class->devnode = cpuid_devnode;  	for_each_online_cpu(i) {  		err = cpuid_device_create(i);  		if (err != 0) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 147005a1cc3..85419bb7d4a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)  	struct resource *res;  	u64 end; -	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); +	res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);  	e820_res = res;  	for (i = 0; i < e820.nr_map; i++) {  		end = e820.map[i].addr + e820.map[i].size - 1; @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)  	return who;  } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ -	if (x86_quirks->arch_memory_setup) { -		char *who = x86_quirks->arch_memory_setup(); - -		if (who) -			return who; -	} -	return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ -	return machine_specific_memory_setup(); -} -  void __init setup_memory_map(void)  {  	char *who; -	who = memory_setup(); +	who = x86_init.resources.memory_setup();  	memcpy(&e820_saved, &e820, sizeof(struct e820map));  	printk(KERN_INFO "BIOS-provided physical RAM map:\n");  	e820_print_map(who); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110..2acfd3fdc0c 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = {  	.index =	-1,  }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { -	u32 bus; -	u32 slot; -	u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE	0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ -	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ -	return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT		0xe1 -#define USB_PID_IN		0x69 -#define USB_PID_SOF		0xa5 -#define USB_PID_SETUP		0x2d -/* handshake */ -#define USB_PID_ACK		0xd2 -#define USB_PID_NAK		0x5a -#define USB_PID_STALL		0x1e -#define USB_PID_NYET		0x96 -/* data */ -#define USB_PID_DATA0		0xc3 -#define USB_PID_DATA1		0x4b -#define USB_PID_DATA2		0x87 -#define USB_PID_MDATA		0x0f -/* Special */ -#define USB_PID_PREAMBLE	0x3c -#define USB_PID_ERR		0x3c -#define USB_PID_SPLIT		0x78 -#define USB_PID_PING		0xb4 -#define USB_PID_UNDEF_0		0xf0 - -#define USB_PID_DATA_TOGGLE	
0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG	0xa - -#define HUB_ROOT_RESET_TIME	50	/* times are in msec */ -#define HUB_SHORT_RESET_TIME	10 -#define HUB_LONG_RESET_TIME	200 -#define HUB_RESET_TIMEOUT	500 - -#define DBGP_MAX_PACKET		8 - -static int dbgp_wait_until_complete(void) -{ -	u32 ctrl; -	int loop = 0x100000; - -	do { -		ctrl = readl(&ehci_debug->control); -		/* Stop when the transaction is finished */ -		if (ctrl & DBGP_DONE) -			break; -	} while (--loop > 0); - -	if (!loop) -		return -1; - -	/* -	 * Now that we have observed the completed transaction, -	 * clear the done bit. -	 */ -	writel(ctrl | DBGP_DONE, &ehci_debug->control); -	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ -	int i; - -	while (ms--) { -		for (i = 0; i < 1000; i++) -			outb(0x1, 0x80); -	} -} - -static void dbgp_breath(void) -{ -	/* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ -	u32 pids, lpid; -	int ret; -	int loop = 3; - -retry: -	writel(ctrl | DBGP_GO, &ehci_debug->control); -	ret = dbgp_wait_until_complete(); -	pids = readl(&ehci_debug->pids); -	lpid = DBGP_PID_GET(pids); - -	if (ret < 0) -		return ret; - -	/* -	 * If the port is getting full or it has dropped data -	 * start pacing ourselves, not necessary but it's friendly. -	 */ -	if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) -		dbgp_breath(); - -	/* If I get a NACK reissue the transmission */ -	if (lpid == USB_PID_NAK) { -		if (--loop > 0) -			goto retry; -	} - -	return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ -	const unsigned char *bytes = buf; -	u32 lo, hi; -	int i; - -	lo = hi = 0; -	for (i = 0; i < 4 && i < size; i++) -		lo |= bytes[i] << (8*i); -	for (; i < 8 && i < size; i++) -		hi |= bytes[i] << (8*(i - 4)); -	writel(lo, &ehci_debug->data03); -	writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ -	unsigned char *bytes = buf; -	u32 lo, hi; -	int i; - -	lo = readl(&ehci_debug->data03); -	hi = readl(&ehci_debug->data47); -	for (i = 0; i < 4 && i < size; i++) -		bytes[i] = (lo >> (8*i)) & 0xff; -	for (; i < 8 && i < size; i++) -		bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, -			 const char *bytes, int size) -{ -	u32 pids, addr, ctrl; -	int ret; - -	if (size > DBGP_MAX_PACKET) -		return -1; - -	addr = DBGP_EPADDR(devnum, endpoint); - -	pids = readl(&ehci_debug->pids); -	pids = dbgp_pid_update(pids, USB_PID_OUT); - -	ctrl = readl(&ehci_debug->control); -	ctrl = dbgp_len_update(ctrl, size); -	ctrl |= DBGP_OUT; -	ctrl |= DBGP_GO; - -	dbgp_set_data(bytes, size); -	writel(addr, &ehci_debug->address); -	writel(pids, &ehci_debug->pids); - -	ret = dbgp_wait_until_done(ctrl); -	if (ret < 0) -		return ret; - -	return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, -				 int size) -{ -	u32 pids, addr, ctrl; -	int ret; - -	if (size > DBGP_MAX_PACKET) -		return -1; - -	addr = DBGP_EPADDR(devnum, endpoint); - -	pids = readl(&ehci_debug->pids); -	pids = dbgp_pid_update(pids, USB_PID_IN); - -	ctrl = readl(&ehci_debug->control); -	ctrl = dbgp_len_update(ctrl, size); -	ctrl &= ~DBGP_OUT; -	ctrl |= DBGP_GO; - -	writel(addr, &ehci_debug->address); -	writel(pids, &ehci_debug->pids); -	ret = dbgp_wait_until_done(ctrl); -	if (ret < 0) -		return ret; - -	if (size > ret) -		size = ret; -	dbgp_get_data(data, size); 
-	return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, -	int request, int value, int index, void *data, int size) -{ -	u32 pids, addr, ctrl; -	struct usb_ctrlrequest req; -	int read; -	int ret; - -	read = (requesttype & USB_DIR_IN) != 0; -	if (size > (read ? DBGP_MAX_PACKET:0)) -		return -1; - -	/* Compute the control message */ -	req.bRequestType = requesttype; -	req.bRequest = request; -	req.wValue = cpu_to_le16(value); -	req.wIndex = cpu_to_le16(index); -	req.wLength = cpu_to_le16(size); - -	pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); -	addr = DBGP_EPADDR(devnum, 0); - -	ctrl = readl(&ehci_debug->control); -	ctrl = dbgp_len_update(ctrl, sizeof(req)); -	ctrl |= DBGP_OUT; -	ctrl |= DBGP_GO; - -	/* Send the setup message */ -	dbgp_set_data(&req, sizeof(req)); -	writel(addr, &ehci_debug->address); -	writel(pids, &ehci_debug->pids); -	ret = dbgp_wait_until_done(ctrl); -	if (ret < 0) -		return ret; - -	/* Read the result */ -	return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ -	u8 pos; -	int bytes; - -	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & -		PCI_STATUS_CAP_LIST)) -		return 0; - -	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); -	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { -		u8 id; - -		pos &= ~3; -		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); -		if (id == 0xff) -			break; -		if (id == cap) -			return pos; - -		pos = read_pci_config_byte(num, slot, func, -						 pos+PCI_CAP_LIST_NEXT); -	} -	return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ -	u32 class; - -	class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); -	if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) -		return 0; - -	return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ -	u32 bus, slot, func; - -	for (bus = 0; bus < 256; bus++) { -		for (slot = 0; slot < 32; slot++) { -			for (func = 0; func < 8; func++) { -				unsigned cap; - -				cap = __find_dbgp(bus, slot, func); - -				if (!cap) -					continue; -				if (ehci_num-- != 0) -					continue; -				*rbus = bus; -				*rslot = slot; -				*rfunc = func; -				return cap; -			} -		} -	} -	return 0; -} - -static int __init ehci_reset_port(int port) -{ -	u32 portsc; -	u32 delay_time, delay; -	int loop; - -	/* Reset the usb debug port */ -	portsc = readl(&ehci_regs->port_status[port - 1]); -	portsc &= ~PORT_PE; -	portsc |= PORT_RESET; -	writel(portsc, &ehci_regs->port_status[port - 1]); - -	delay = HUB_ROOT_RESET_TIME; -	for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; -	     delay_time += delay) { -		dbgp_mdelay(delay); - -		portsc = readl(&ehci_regs->port_status[port - 1]); -		if (portsc & PORT_RESET) { -			/* force reset to complete */ -			loop = 2; -			writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), -				&ehci_regs->port_status[port - 1]); -			do { -				portsc = readl(&ehci_regs->port_status[port-1]); -			} while ((portsc & PORT_RESET) && (--loop > 0)); -		} - -		/* Device went away? 
*/ -		if (!(portsc & PORT_CONNECT)) -			return -ENOTCONN; - -		/* bomb out completely if something weird happend */ -		if ((portsc & PORT_CSC)) -			return -EINVAL; - -		/* If we've finished resetting, then break out of the loop */ -		if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) -			return 0; -	} -	return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ -	u32 status; -	int ret, reps; - -	for (reps = 0; reps < 3; reps++) { -		dbgp_mdelay(100); -		status = readl(&ehci_regs->status); -		if (status & STS_PCD) { -			ret = ehci_reset_port(port); -			if (ret == 0) -				return 0; -		} -	} -	return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) { } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ -	u32 dword; -	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, -				 0x74); -	dword &= ~(0x0f<<12); -	dword |= ((port & 0x0f)<<12); -	write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, -				 dword); -	dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ -	u32 vendorid; - -	vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, -		 0x00); - -	if ((vendorid & 0xffff) == 0x10de) { -		dbgp_printk("using nvidia set_debug_port\n"); -		set_debug_port = nvidia_set_debug_port; -	} -} - -static int __init ehci_setup(void) -{ -	struct usb_debug_descriptor dbgp_desc; -	u32 cmd, ctrl, status, portsc, hcs_params; -	u32 debug_port, new_debug_port = 0, n_ports; -	u32  devnum; -	int ret, i; -	int loop; -	int port_map_tried; -	int playtimes = 3; - -try_next_time: -	port_map_tried = 0; - -try_next_port: - -	hcs_params = readl(&ehci_caps->hcs_params); -	debug_port = HCS_DEBUG_PORT(hcs_params); -	n_ports    = HCS_N_PORTS(hcs_params); - -	dbgp_printk("debug_port: %d\n", debug_port); -	dbgp_printk("n_ports:    %d\n", n_ports); - -	for (i = 1; i <= n_ports; i++) { -		portsc = readl(&ehci_regs->port_status[i-1]); -		dbgp_printk("portstatus%d: %08x\n", i, portsc); -	} - -	if (port_map_tried && (new_debug_port != debug_port)) { -		if (--playtimes) { -			set_debug_port(new_debug_port); -			goto try_next_time; -		} -		return -1; -	} - -	loop = 10; -	/* Reset the EHCI controller */ -	cmd = readl(&ehci_regs->command); -	cmd |= CMD_RESET; -	writel(cmd, &ehci_regs->command); -	do { -		cmd = readl(&ehci_regs->command); -	} while ((cmd & CMD_RESET) && (--loop > 0)); - -	if (!loop) { -		dbgp_printk("can not reset ehci\n"); -		return -1; -	} -	dbgp_printk("ehci reset done\n"); - -	/* Claim ownership, but do not enable yet */ -	ctrl = readl(&ehci_debug->control); -	ctrl |= DBGP_OWNER; -	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); -	writel(ctrl, &ehci_debug->control); - -	/* Start the ehci running */ -	cmd = readl(&ehci_regs->command); -	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); -	cmd |= CMD_RUN; -	writel(cmd, &ehci_regs->command); - -	/* Ensure everything is routed to the EHCI */ -	writel(FLAG_CF, &ehci_regs->configured_flag); - -	/* Wait until the controller is no longer halted */ -	loop = 10; -	do { -		status = readl(&ehci_regs->status); -	} while ((status & STS_HALT) && (--loop > 0)); - -	if (!loop) { -		dbgp_printk("ehci can be started\n"); -		return -1; -	} -	dbgp_printk("ehci started\n"); - -	/* Wait for a device to show 
up in the debug port */ -	ret = ehci_wait_for_port(debug_port); -	if (ret < 0) { -		dbgp_printk("No device found in debug port\n"); -		goto next_debug_port; -	} -	dbgp_printk("ehci wait for port done\n"); - -	/* Enable the debug port */ -	ctrl = readl(&ehci_debug->control); -	ctrl |= DBGP_CLAIM; -	writel(ctrl, &ehci_debug->control); -	ctrl = readl(&ehci_debug->control); -	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { -		dbgp_printk("No device in debug port\n"); -		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); -		goto err; -	} -	dbgp_printk("debug ported enabled\n"); - -	/* Completely transfer the debug device to the debug controller */ -	portsc = readl(&ehci_regs->port_status[debug_port - 1]); -	portsc &= ~PORT_PE; -	writel(portsc, &ehci_regs->port_status[debug_port - 1]); - -	dbgp_mdelay(100); - -	/* Find the debug device and make it device number 127 */ -	for (devnum = 0; devnum <= 127; devnum++) { -		ret = dbgp_control_msg(devnum, -			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, -			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, -			&dbgp_desc, sizeof(dbgp_desc)); -		if (ret > 0) -			break; -	} -	if (devnum > 127) { -		dbgp_printk("Could not find attached debug device\n"); -		goto err; -	} -	if (ret < 0) { -		dbgp_printk("Attached device is not a debug device\n"); -		goto err; -	} -	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - -	/* Move the device to 127 if it isn't already there */ -	if (devnum != USB_DEBUG_DEVNUM) { -		ret = dbgp_control_msg(devnum, -			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, -			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); -		if (ret < 0) { -			dbgp_printk("Could not move attached device to %d\n", -				USB_DEBUG_DEVNUM); -			goto err; -		} -		devnum = USB_DEBUG_DEVNUM; -		dbgp_printk("debug device renamed to 127\n"); -	} - -	/* Enable the debug interface */ -	ret = dbgp_control_msg(USB_DEBUG_DEVNUM, -		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, -		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); -	if (ret < 0) { -		dbgp_printk(" Could not enable the debug device\n"); -		goto err; -	} -	dbgp_printk("debug interface enabled\n"); - -	/* Perform a small write to get the even/odd data state in sync -	 */ -	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); -	if (ret < 0) { -		dbgp_printk("dbgp_bulk_write failed: %d\n", ret); -		goto err; -	} -	dbgp_printk("small write doned\n"); - -	return 0; -err: -	/* Things didn't work so remove my claim */ -	ctrl = readl(&ehci_debug->control); -	ctrl &= ~(DBGP_CLAIM | DBGP_OUT); -	writel(ctrl, &ehci_debug->control); -	return -1; - -next_debug_port: -	port_map_tried |= (1<<(debug_port - 1)); -	new_debug_port = ((debug_port-1+1)%n_ports) + 1; -	if (port_map_tried != ((1<<n_ports) - 1)) { -		set_debug_port(new_debug_port); -		goto try_next_port; -	} -	if (--playtimes) { -		set_debug_port(new_debug_port); -		goto try_next_time; -	} - -	return -1; -} - -static int __init early_dbgp_init(char *s) -{ -	u32 debug_port, bar, offset; -	u32 bus, slot, func, cap; -	void __iomem *ehci_bar; -	u32 dbgp_num; -	u32 bar_val; -	char *e; -	int ret; -	u8 byte; - -	if (!early_pci_allowed()) -		return -1; - -	dbgp_num = 0; -	if (*s) -		dbgp_num = simple_strtoul(s, &e, 10); -	dbgp_printk("dbgp_num: %d\n", dbgp_num); - -	cap = find_dbgp(dbgp_num, &bus, &slot, &func); -	if (!cap) -		return -1; - -	dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, -			 func); - -	debug_port = read_pci_config(bus, slot, func, cap); -	bar = (debug_port >> 29) & 0x7; -	bar = (bar * 4) + 0xc; -	offset 
= (debug_port >> 16) & 0xfff; -	dbgp_printk("bar: %02x offset: %03x\n", bar, offset); -	if (bar != PCI_BASE_ADDRESS_0) { -		dbgp_printk("only debug ports on bar 1 handled.\n"); - -		return -1; -	} - -	bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); -	dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); -	if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { -		dbgp_printk("only simple 32bit mmio bars supported\n"); - -		return -1; -	} - -	/* double check if the mem space is enabled */ -	byte = read_pci_config_byte(bus, slot, func, 0x04); -	if (!(byte & 0x2)) { -		byte  |= 0x02; -		write_pci_config_byte(bus, slot, func, 0x04, byte); -		dbgp_printk("mmio for ehci enabled\n"); -	} - -	/* -	 * FIXME I don't have the bar size so just guess PAGE_SIZE is more -	 * than enough.  1K is the biggest I have seen. -	 */ -	set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); -	ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); -	ehci_bar += bar_val & ~PAGE_MASK; -	dbgp_printk("ehci_bar: %p\n", ehci_bar); - -	ehci_caps  = ehci_bar; -	ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); -	ehci_debug = ehci_bar + offset; -	ehci_dev.bus = bus; -	ehci_dev.slot = slot; -	ehci_dev.func = func; - -	detect_set_debug_port(); - -	ret = ehci_setup(); -	if (ret < 0) { -		dbgp_printk("ehci_setup failed\n"); -		ehci_debug = NULL; - -		return -1; -	} - -	return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ -	int chunk, ret; - -	if (!ehci_debug) -		return; -	while (n > 0) { -		chunk = n; -		if (chunk > DBGP_MAX_PACKET) -			chunk = DBGP_MAX_PACKET; -		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, -			dbgp_endpoint_out, str, chunk); -		str += chunk; -		n -= chunk; -	} -} - -static struct console early_dbgp_console = { -	.name =		"earlydbg", -	.write =	early_dbgp_write, -	.flags =	CON_PRINTBUFFER, -	.index =	-1, -}; -#endif -  /* Direct interface for emergencies */  static struct console *early_console = &early_vga_console;  static int __initdata early_console_initialized; @@ -891,10 +176,19 @@ asmlinkage void early_printk(const char *fmt, ...)  	
va_end(ap);  } +static inline void early_console_register(struct console *con, int keep_early) +{ +	early_console = con; +	if (keep_early) +		early_console->flags &= ~CON_BOOT; +	else +		early_console->flags |= CON_BOOT; +	register_console(early_console); +}  static int __init setup_early_printk(char *buf)  { -	int keep_early; +	int keep;  	if (!buf)  		return 0; @@ -903,42 +197,34 @@ static int __init setup_early_printk(char *buf)  		return 0;  	early_console_initialized = 1; -	keep_early = (strstr(buf, "keep") != NULL); +	keep = (strstr(buf, "keep") != NULL); -	if (!strncmp(buf, "serial", 6)) { -		early_serial_init(buf + 6); -		early_console = &early_serial_console; -	} else if (!strncmp(buf, "ttyS", 4)) { -		early_serial_init(buf); -		early_console = &early_serial_console; -	} else if (!strncmp(buf, "vga", 3) -		&& boot_params.screen_info.orig_video_isVGA == 1) { -		max_xpos = boot_params.screen_info.orig_video_cols; -		max_ypos = boot_params.screen_info.orig_video_lines; -		current_ypos = boot_params.screen_info.orig_y; -		early_console = &early_vga_console; +	while (*buf != '\0') { +		if (!strncmp(buf, "serial", 6)) { +			early_serial_init(buf + 6); +			early_console_register(&early_serial_console, keep); +		} +		if (!strncmp(buf, "ttyS", 4)) { +			early_serial_init(buf + 4); +			early_console_register(&early_serial_console, keep); +		} +		if (!strncmp(buf, "vga", 3) && +		    boot_params.screen_info.orig_video_isVGA == 1) { +			max_xpos = boot_params.screen_info.orig_video_cols; +			max_ypos = boot_params.screen_info.orig_video_lines; +			current_ypos = boot_params.screen_info.orig_y; +			early_console_register(&early_vga_console, keep); +		}  #ifdef CONFIG_EARLY_PRINTK_DBGP -	} else if (!strncmp(buf, "dbgp", 4)) { -		if (early_dbgp_init(buf+4) < 0) -			return 0; -		early_console = &early_dbgp_console; -		/* -		 * usb subsys will reset ehci controller, so don't keep -		 * that early console -		 */ -		keep_early = 0; +		if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) +			early_console_register(&early_dbgp_console, keep);  #endif  #ifdef CONFIG_HVC_XEN -	} else if (!strncmp(buf, "xen", 3)) { -		early_console = &xenboot_console; +		if (!strncmp(buf, "xen", 3)) +			early_console_register(&xenboot_console, keep);  #endif +		buf++;  	} - -	if (keep_early) -		early_console->flags &= ~CON_BOOT; -	else -		early_console->flags |= CON_BOOT; -	register_console(early_console);  	return 0;  } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index fe26ba3e345..ad5bd988fb7 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -42,6 +42,7 @@  #include <asm/time.h>  #include <asm/cacheflush.h>  #include <asm/tlbflush.h> +#include <asm/x86_init.h>  #define EFI_DEBUG	1  #define PFX 		"EFI: " @@ -453,6 +454,9 @@ void __init efi_init(void)  	if (add_efi_memmap)  		do_add_efi_memmap(); +	x86_platform.get_wallclock = efi_get_time; +	x86_platform.set_wallclock = efi_set_rtc_mmss; +  	/* Setup for EFI runtime service */  	reboot_type = BOOT_EFI; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be74510..b5c061f8f35 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)  END(ftrace_graph_caller)  GLOBAL(return_to_handler) -	subq  $80, %rsp +	subq  $24, %rsp  	/* Save the return values */  	movq %rax, (%rsp) @@ -155,10 +155,10 @@ GLOBAL(return_to_handler)  	call ftrace_return_to_handler -	movq %rax, 72(%rsp) +	movq %rax, 16(%rsp)  	movq 8(%rsp), %rdx  	movq (%rsp), %rax -	addq $72, %rsp 
+	addq $16, %rsp  	retq  #endif @@ -536,20 +536,13 @@ sysret_signal:  	bt $TIF_SYSCALL_AUDIT,%edx  	jc sysret_audit  #endif -	/* edx:	work flags (arg3) */ -	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 -	xorl %esi,%esi # oldset -> arg2 -	SAVE_REST -	FIXUP_TOP_OF_STACK %r11 -	call do_notify_resume -	RESTORE_TOP_OF_STACK %r11 -	RESTORE_REST -	movl $_TIF_WORK_MASK,%edi -	/* Use IRET because user could have changed frame. This -	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ -	DISABLE_INTERRUPTS(CLBR_NONE) -	TRACE_IRQS_OFF -	jmp int_with_check +	/* +	 * We have a signal, or exit tracing or single-step. +	 * These all wind up with the iret return path anyway, +	 * so just join that path right now. +	 */ +	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET +	jmp int_check_syscall_exit_work  badsys:  	movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful:  int_very_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work:  	SAVE_REST  	/* Check for syscall exit trace */  	testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \  apicinterrupt SPURIOUS_APIC_VECTOR \  	spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS  apicinterrupt LOCAL_PENDING_VECTOR \  	perf_pending_interrupt smp_perf_pending_interrupt  #endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,8 +11,21 @@  #include <asm/setup.h>  #include <asm/sections.h>  #include <asm/e820.h> -#include <asm/bios_ebda.h> +#include <asm/page.h>  #include <asm/trampoline.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> + +static void __init i386_default_early_setup(void) +{ +	/* Initilize 32bit specific setup functions */ +	x86_init.resources.probe_roms = probe_roms; +	x86_init.resources.reserve_resources = i386_reserve_resources; +	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + +	reserve_ebda_region(); +}  void __init i386_start_kernel(void)  { @@ -29,7 +42,16 @@ void __init i386_start_kernel(void)  		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");  	}  #endif -	reserve_ebda_region(); + +	/* Call the subarch specific early setup function */ +	switch (boot_params.hdr.hardware_subarch) { +	case X86_SUBARCH_MRST: +		x86_mrst_early_setup(); +		break; +	default: +		i386_default_early_setup(); +		break; +	}  	/*  	 * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,8 +23,8 @@  #include <asm/sections.h>  #include <asm/kdebug.h>  #include <asm/e820.h> -#include <asm/bios_ebda.h>  #include <asm/trampoline.h> +#include <asm/bios_ebda.h>  static void __init zap_identity_mappings(void)  { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ffec6b3b33..218aad7ee76 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -157,6 +157,7 @@ subarch_entries:  	.long default_entry		/* normal x86/PC */  	.long lguest_entry		/* lguest hypervisor */  	.long xen_entry			/* Xen hypervisor */ +	.long default_entry		/* Moorestown MID */  num_subarch_entries = (. 
- subarch_entries) / 4  .previous  #endif /* CONFIG_PARAVIRT */ @@ -607,7 +608,7 @@ ENTRY(initial_code)  /*   * BSS section   */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS  	.align PAGE_SIZE_asm  #ifdef CONFIG_X86_PAE  swapper_pg_pmd: @@ -625,7 +626,7 @@ ENTRY(empty_zero_page)   * This starts the data section.   */  #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA  	/* Page-aligned for the benefit of paravirt? */  	.align PAGE_SIZE_asm  ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a0..d0bc0a13a43 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -418,7 +418,7 @@ ENTRY(phys_base)  ENTRY(idt_table)  	.skip IDT_ENTRIES * 16 -	.section .bss.page_aligned, "aw", @nobits +	__PAGE_ALIGNED_BSS  	.align PAGE_SIZE  ENTRY(empty_zero_page)  	.skip PAGE_SIZE diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@  DEFINE_SPINLOCK(i8253_lock);  EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif -  /*   * HPET replaces the PIT, when enabled. So we need to know, which of   * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,  			outb_pit(0, PIT_CH0);  			outb_pit(0, PIT_CH0);  		} -		pit_disable_clocksource();  		break;  	case CLOCK_EVT_MODE_ONESHOT:  		/* One shot setup */ -		pit_disable_clocksource();  		outb_pit(0x38, PIT_MODE);  		break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = {  	.shift		= 20,  }; -static void pit_disable_clocksource(void) -{ -	/* -	 * Use mult to check whether it is registered or not -	 */ -	if (pit_cs.mult) { -		clocksource_unregister(&pit_cs); -		pit_cs.mult = 0; -	} -} -  static int __init init_pit_clocksource(void)  {  	 /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc1..3a54dcb9cd0 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);   * way process stacks are handled. This is done by having a special   * "init_task" linker map entry..   */ -union thread_union init_thread_union -	__attribute__((__section__(".data.init_task"))) = -		{ INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = +	{ INIT_THREAD_INFO(init_task) };  /*   * Initial task structure. 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..74656d1d4e3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)  	seq_printf(p, "  Threshold APIC interrupts\n");  # endif  #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE  	seq_printf(p, "%*s: ", prec, "MCE");  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  	sum += irq_stats(cpu)->irq_threshold_count;  # endif  #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE  	sum += per_cpu(mce_exception_count, cpu);  	sum += per_cpu(mce_poll_count, cpu);  #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..40f30773fb2 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)  	return 0;  } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void)  {  	int i; @@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)  	}  } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void __init init_IRQ(void) +{ +	x86_init.irqs.intr_init(); +}  static void __init smp_intr_init(void)  { @@ -190,7 +192,7 @@ static void __init apic_intr_init(void)  #ifdef CONFIG_X86_MCE_THRESHOLD  	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);  #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)  	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);  #endif @@ -206,39 +208,19 @@ static void __init apic_intr_init(void)  	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);  	/* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS  	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);  # endif  #endif  } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - *	Perform any necessary interrupt initialisation prior to setting up - *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA - *	interrupts should be initialised here if the machine emulates a PC - *	in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 -	if (x86_quirks->arch_pre_intr_init) { -		if (x86_quirks->arch_pre_intr_init()) -			return; -	} -#endif -	init_ISA_irqs(); -} -  void __init native_init_IRQ(void)  {  	int i;  	/* Execute any quirks before the call gates are initialised: */ -	x86_quirk_pre_intr_init(); +	x86_init.irqs.pre_vector_init();  	apic_intr_init(); @@ -258,12 +240,6 @@ void __init native_init_IRQ(void)  #ifdef CONFIG_X86_32  	/* -	 * Call quirks after call gates are initialised (usually add in -	 * the architecture specific gates): -	 */ -	x86_quirk_intr_init(); - -	/*  	 * External FPU? Set up irq13 if so, for  	 * original braindamaged IBM FERR coupling.  	 
*/ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e5efcdcca31..feaeb0d3aa4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@  #include <asm/msr.h>  #include <asm/apic.h>  #include <linux/percpu.h> + +#include <asm/x86_init.h>  #include <asm/reboot.h>  #define KVM_SCALE 22 @@ -182,12 +184,13 @@ void __init kvmclock_init(void)  	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {  		if (kvm_register_clock("boot clock"))  			return; -		pv_time_ops.get_wallclock = kvm_get_wallclock; -		pv_time_ops.set_wallclock = kvm_set_wallclock;  		pv_time_ops.sched_clock = kvm_clock_read; -		pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; +		x86_platform.calibrate_tsc = kvm_get_tsc_khz; +		x86_platform.get_wallclock = kvm_get_wallclock; +		x86_platform.set_wallclock = kvm_set_wallclock;  #ifdef CONFIG_X86_LOCAL_APIC -		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; +		x86_cpuinit.setup_percpu_clockev = +			kvm_setup_secondary_clock;  #endif  #ifdef CONFIG_SMP  		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290a..378e9a8f1bf 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,  {  	ssize_t ret = -EINVAL; -	if ((len >> PAGE_SHIFT) > num_physpages) { -		pr_err("microcode: too much data (max %ld pages)\n", num_physpages); +	if ((len >> PAGE_SHIFT) > totalram_pages) { +		pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);  		return ret;  	} @@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {  static struct miscdevice microcode_dev = {  	.minor			= MICROCODE_MINOR,  	.name			= "microcode", -	.devnode		= "cpu/microcode", +	.nodename		= "cpu/microcode",  	.fops			= µcode_fops,  }; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fcd513bf284..5be95ef4ffe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)  	return sum & 0xFF;  } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ +	return m->apicid; +} +  static void __init MP_processor_info(struct mpc_cpu *m)  {  	int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)  		return;  	} -	if (x86_quirks->mpc_apic_id) -		apicid = x86_quirks->mpc_apic_id(m); -	else -		apicid = m->apicid; +	apicid = x86_init.mpparse.mpc_apic_id(m);  	if (m->cpuflag & CPU_BOOTPROCESSOR) {  		bootup_cpu = " (Bootup-CPU)"; @@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)  }  #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)  { -	char str[7];  	memcpy(str, m->bustype, 6);  	str[6] = 0; +	apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} -	if (x86_quirks->mpc_oem_bus_info) -		x86_quirks->mpc_oem_bus_info(m, str); -	else -		apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +static void __init MP_bus_info(struct mpc_bus *m) +{ +	char str[7]; + +	x86_init.mpparse.mpc_oem_bus_info(m, str);  #if MAX_MP_BUSSES < 256  	if (m->busid >= MAX_MP_BUSSES) { @@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)  		mp_bus_id_to_type[m->busid] = MP_BUS_ISA;  #endif  	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { -		
if (x86_quirks->mpc_oem_pci_bus) -			x86_quirks->mpc_oem_pci_bus(m); +		if (x86_init.mpparse.mpc_oem_pci_bus) +			x86_init.mpparse.mpc_oem_pci_bus(m);  		clear_bit(m->busid, mp_bus_not_pci);  #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)  			1, mpc, mpc->length, 1);  } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +  static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)  {  	char str[16]; @@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)  	if (early)  		return 1; -	if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { -		struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; -		x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); -	} +	if (mpc->oemptr) +		x86_init.mpparse.smp_read_mpc_oem(mpc);  	/*  	 *      Now process the configuration blocks.  	 */ -	if (x86_quirks->mpc_record) -		*x86_quirks->mpc_record = 0; +	x86_init.mpparse.mpc_record(0);  	while (count < mpc->length) {  		switch (*mpt) { @@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)  			count = mpc->length;  			break;  		} -		if (x86_quirks->mpc_record) -			(*x86_quirks->mpc_record)++; +		x86_init.mpparse.mpc_record(1);  	}  #ifdef CONFIG_X86_BIGSMP @@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)  /*   * Scan the memory blocks for an SMP configuration block.   */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early)  {  	struct mpf_intel *mpf = mpf_found; @@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)  	if (acpi_lapic && acpi_ioapic)  		return; -	if (x86_quirks->mach_get_smp_config) { -		if (x86_quirks->mach_get_smp_config(early)) -			return; -	} -  	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",  	       mpf->specification);  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)  	 */  } -void __init early_get_smp_config(void) -{ -	__get_smp_config(1); -} - -void __init get_smp_config(void) -{ -	__get_smp_config(0); -} -  static void __init smp_reserve_bootmem(struct mpf_intel *mpf)  {  	unsigned long size = get_mpc_size(mpf->physptr); @@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,  	return 0;  } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve)  {  	unsigned int address; -	if (x86_quirks->mach_find_smp_config) { -		if (x86_quirks->mach_find_smp_config(reserve)) -			return; -	}  	/*  	 * FIXME: Linux assumes you have 640K of base ram..  	 * this continues the error... 
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve)  		smp_scan_config(address, 0x400, reserve);  } -void __init early_find_smp_config(void) -{ -	__find_smp_config(0); -} - -void __init find_smp_config(void) -{ -	__find_smp_config(1); -} -  #ifdef CONFIG_X86_IO_APIC  static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> + +#include <asm/setup.h> + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ +	x86_init.resources.probe_roms = x86_init_noop; +	x86_init.resources.reserve_resources = x86_init_noop; +} diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7dd95009417..6a3cefc7dda 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -241,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {  	.notifier_call = msr_class_cpu_callback,  }; -static char *msr_nodename(struct device *dev) +static char *msr_devnode(struct device *dev, mode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));  } @@ -262,7 +262,7 @@ static int __init msr_init(void)  		err = PTR_ERR(msr_class);  		goto out_chrdev;  	} -	msr_class->nodename = msr_nodename; +	msr_class->devnode = msr_devnode;  	for_each_online_cpu(i) {  		err = msr_device_create(i);  		if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f5b0b4a01fb..1b1739d1631 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)  	return x;  } -static void __init default_banner(void) +void __init default_banner(void)  {  	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",  	       pv_info.name);  } -char *memory_setup(void) -{ -	return pv_init_ops.memory_setup(); -} -  /* Simple instruction patching code. 
*/  #define DEF_NATIVE(ops, name, code)					\  	extern const char start_##ops##_##name[], end_##ops##_##name[];	\ @@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,  	return insn_len;  } -void init_IRQ(void) -{ -	pv_irq_ops.init_IRQ(); -} -  static void native_flush_tlb(void)  {  	__native_flush_tlb(); @@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);  extern void native_usergs_sysret32(void);  extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ -	pv_init_ops.banner(); -	return 0; -} -core_initcall(print_banner); -  static struct resource reserve_ioports = {  	.start = 0,  	.end = IO_SPACE_LIMIT, @@ -320,21 +303,13 @@ struct pv_info pv_info = {  struct pv_init_ops pv_init_ops = {  	.patch = native_patch, -	.banner = default_banner, -	.arch_setup = paravirt_nop, -	.memory_setup = machine_specific_memory_setup,  };  struct pv_time_ops pv_time_ops = { -	.time_init = hpet_time_init, -	.get_wallclock = native_get_wallclock, -	.set_wallclock = native_set_wallclock,  	.sched_clock = native_sched_clock, -	.get_tsc_khz = native_calibrate_tsc,  };  struct pv_irq_ops pv_irq_ops = { -	.init_IRQ = native_init_IRQ,  	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),  	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),  	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {  struct pv_apic_ops pv_apic_ops = {  #ifdef CONFIG_X86_LOCAL_APIC -	.setup_boot_clock = setup_boot_APIC_clock, -	.setup_secondary_clock = setup_secondary_APIC_clock,  	.startup_ipi_hook = paravirt_nop,  #endif  }; @@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {  #endif  struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 -	.pagetable_setup_start = native_pagetable_setup_start, -	.pagetable_setup_done = native_pagetable_setup_done, -#else -	.pagetable_setup_start = paravirt_nop, -	.pagetable_setup_done = paravirt_nop, -#endif  	.read_cr2 = native_read_cr2,  	.write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d71c8655905..64b838eac18 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -225,10 +225,8 @@ static __init int iommu_setup(char *p)  		if (!strncmp(p, "soft", 4))  			swiotlb = 1;  #endif -		if (!strncmp(p, "pt", 2)) { +		if (!strncmp(p, "pt", 2))  			iommu_pass_through = 1; -			return 1; -		}  		gart_parse_options(p); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e8a35016115..aaa6b7839f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -46,9 +46,8 @@ void __init pci_swiotlb_init(void)  {  	/* don't initialize swiotlb if iommu=off (no_iommu=1) */  #ifdef CONFIG_X86_64 -	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || -		iommu_pass_through) -	       swiotlb = 1; +	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) +		swiotlb = 1;  #endif  	if (swiotlb_force)  		swiotlb = 1; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba8..847ab416031 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@  #include <linux/pm.h>  #include <linux/clockchips.h>  #include <linux/random.h> -#include <trace/power.h> +#include <trace/events/power.h>  #include <asm/system.h>  #include <asm/apic.h>  #include <asm/syscalls.h> @@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait);  struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); -  int 
arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)  {  	*dst = *src; @@ -299,9 +296,7 @@ static inline int hlt_use_halt(void)  void default_idle(void)  {  	if (hlt_use_halt()) { -		struct power_trace it; - -		trace_power_start(&it, POWER_CSTATE, 1); +		trace_power_start(POWER_CSTATE, 1);  		current_thread_info()->status &= ~TS_POLLING;  		/*  		 * TS_POLLING-cleared state must be visible before we @@ -314,7 +309,6 @@ void default_idle(void)  		else  			local_irq_enable();  		current_thread_info()->status |= TS_POLLING; -		trace_power_end(&it);  	} else {  		local_irq_enable();  		/* loop is done by the caller */ @@ -372,9 +366,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);   */  void mwait_idle_with_hints(unsigned long ax, unsigned long cx)  { -	struct power_trace it; - -	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); +	trace_power_start(POWER_CSTATE, (ax>>4)+1);  	if (!need_resched()) {  		if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))  			clflush((void *)¤t_thread_info()->flags); @@ -384,15 +376,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)  		if (!need_resched())  			__mwait(ax, cx);  	} -	trace_power_end(&it);  }  /* Default MONITOR/MWAIT with no hints, used for default C1 state */  static void mwait_idle(void)  { -	struct power_trace it;  	if (!need_resched()) { -		trace_power_start(&it, POWER_CSTATE, 1); +		trace_power_start(POWER_CSTATE, 1);  		if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))  			clflush((void *)¤t_thread_info()->flags); @@ -402,7 +392,6 @@ static void mwait_idle(void)  			__sti_mwait(0, 0);  		else  			local_irq_enable(); -		trace_power_end(&it);  	} else  		local_irq_enable();  } @@ -414,13 +403,11 @@ static void mwait_idle(void)   */  static void poll_idle(void)  { -	struct power_trace it; - -	trace_power_start(&it, POWER_CSTATE, 0); +	trace_power_start(POWER_CSTATE, 0);  	local_irq_enable();  	while (!need_resched())  		cpu_relax(); -	trace_power_end(&it); +	trace_power_end(0);  }  /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be..7b058a2dc66 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -325,16 +325,6 @@ static int putreg(struct task_struct *child,  		return set_flags(child, value);  #ifdef CONFIG_X86_64 -	/* -	 * Orig_ax is really just a flag with small positive and -	 * negative values, so make sure to always sign-extend it -	 * from 32 bits so that it works correctly regardless of -	 * whether we come from a 32-bit environment or not. -	 */ -	case offsetof(struct user_regs_struct, orig_ax): -		value = (long) (s32) value; -		break; -  	case offsetof(struct user_regs_struct,fs_base):  		if (value >= TASK_SIZE_OF(child))  			return -EIO; @@ -1126,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)  	case offsetof(struct user32, regs.orig_eax):  		/* -		 * Sign-extend the value so that orig_eax = -1 -		 * causes (long)orig_ax < 0 tests to fire correctly. +		 * A 32-bit debugger setting orig_eax means to restore +		 * the state of the task restarting a 32-bit syscall. +		 * Make sure we interpret the -ERESTART* codes correctly +		 * in case the task is not actually still sitting at the +		 * exit from a 32-bit syscall with TS_COMPAT still set.  		 
*/ -		regs->orig_ax = (long) (s32) value; +		regs->orig_ax = value; +		if (syscall_get_nr(child, regs) >= 0) +			task_thread_info(child)->status |= TS_COMPAT;  		break;  	case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624b..6c3b2c6fd77 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)  	pci_read_config_dword(nb_ht, 0x60, &val);  	set_dev_node(&dev->dev, val & 7); -	pci_dev_put(dev); +	pci_dev_put(nb_ht);  }  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..1cfbbfc3ae2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -8,6 +8,7 @@  #include <linux/pnp.h>  #include <asm/vsyscall.h> +#include <asm/x86_init.h>  #include <asm/time.h>  #ifdef CONFIG_X86_32 @@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)  }  EXPORT_SYMBOL(rtc_cmos_write); -static int set_rtc_mmss(unsigned long nowtime) +int update_persistent_clock(struct timespec now)  {  	unsigned long flags;  	int retval;  	spin_lock_irqsave(&rtc_lock, flags); -	retval = set_wallclock(nowtime); +	retval = x86_platform.set_wallclock(now.tv_sec);  	spin_unlock_irqrestore(&rtc_lock, flags);  	return retval;  }  /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts)  {  	unsigned long retval, flags;  	spin_lock_irqsave(&rtc_lock, flags); -	retval = get_wallclock(); +	retval = x86_platform.get_wallclock();  	spin_unlock_irqrestore(&rtc_lock, flags); -	return retval; -} - -int update_persistent_clock(struct timespec now) -{ -	return set_rtc_mmss(now.tv_sec); +	ts->tv_sec = retval; +	ts->tv_nsec = 0;  }  unsigned long long native_read_tsc(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 19f15c4076f..e09f0e2c14b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@  #include <linux/screen_info.h>  #include <linux/ioport.h>  #include <linux/acpi.h> +#include <linux/sfi.h>  #include <linux/apm_bios.h>  #include <linux/initrd.h>  #include <linux/bootmem.h> @@ -109,10 +110,6 @@  #include <asm/numa_64.h>  #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif -  /*   * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.   
* The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -134,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu)  	return __default_cpu_present_to_apicid(mps_cpu);  } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid)  { -	return __default_check_phys_apicid_present(boot_cpu_physical_apicid); +	return __default_check_phys_apicid_present(phys_apicid);  }  #endif @@ -172,13 +169,6 @@ static struct resource bss_resource = {  #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { -	.name	= "Video RAM area", -	.start	= 0xa0000, -	.end	= 0xbffff, -	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM -}; -  /* cpu data as detected by the assembly code in head.S */  struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};  /* common cpu data for all cpus */ @@ -606,7 +596,7 @@ static struct resource standard_io_resources[] = {  		.flags = IORESOURCE_BUSY | IORESOURCE_IO }  }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void)  {  	int i; @@ -638,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg)  early_param("elfcorehdr", setup_elfcorehdr);  #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; -  #ifdef CONFIG_X86_RESERVE_LOW_64K  static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)  { @@ -712,21 +698,6 @@ void __init setup_arch(char **cmdline_p)  	printk(KERN_INFO "Command line: %s\n", boot_command_line);  #endif -	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); -	*cmdline_p = command_line; - -#ifdef CONFIG_X86_64 -	/* -	 * Must call this twice: Once just to detect whether hardware doesn't -	 * support NX (so that the early EHCI debug console setup can safely -	 * call set_fixmap(), and then again after parsing early parameters to -	 * honor the respective command line option. -	 */ -	check_efer(); -#endif - -	parse_early_param(); -  	/* VMI may relocate the fixmap; do this before touching ioremap area */  	vmi_init(); @@ -773,7 +744,7 @@ void __init setup_arch(char **cmdline_p)  	}  #endif -	ARCH_SETUP +	x86_init.oem.arch_setup();  	setup_memory_map();  	parse_setup_data(); @@ -809,6 +780,21 @@ void __init setup_arch(char **cmdline_p)  #endif  #endif +	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); +	*cmdline_p = command_line; + +#ifdef CONFIG_X86_64 +	/* +	 * Must call this twice: Once just to detect whether hardware doesn't +	 * support NX (so that the early EHCI debug console setup can safely +	 * call set_fixmap(), and then again after parsing early parameters to +	 * honor the respective command line option. +	 */ +	check_efer(); +#endif + +	parse_early_param(); +  #ifdef CONFIG_X86_64  	check_efer();  #endif @@ -844,11 +830,9 @@ void __init setup_arch(char **cmdline_p)  	 * VMware detection requires dmi to be available, so this  	 * needs to be done after dmi_scan_machine, for the BP.  	 
*/ -	init_hypervisor(&boot_cpu_data); +	init_hypervisor_platform(); -#ifdef CONFIG_X86_32 -	probe_roms(); -#endif +	x86_init.resources.probe_roms();  	/* after parse_early_param, so could debug it */  	insert_resource(&iomem_resource, &code_resource); @@ -983,10 +967,9 @@ void __init setup_arch(char **cmdline_p)  	kvmclock_init();  #endif -	paravirt_pagetable_setup_start(swapper_pg_dir); +	x86_init.paging.pagetable_setup_start(swapper_pg_dir);  	paging_init(); -	paravirt_pagetable_setup_done(swapper_pg_dir); -	paravirt_post_allocator_init(); +	x86_init.paging.pagetable_setup_done(swapper_pg_dir);  	tboot_probe(); @@ -1003,13 +986,13 @@ void __init setup_arch(char **cmdline_p)  	 */  	acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) +	sfi_init(); +  	/*  	 * get boot-time SMP configuration:  	 */  	if (smp_found_config)  		get_smp_config(); -#endif  	prefill_possible_map(); @@ -1028,10 +1011,7 @@ void __init setup_arch(char **cmdline_p)  	e820_reserve_resources();  	e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 -	request_resource(&iomem_resource, &video_ram_resource); -#endif -	reserve_standard_io_resources(); +	x86_init.resources.reserve_resources();  	e820_setup_gap(); @@ -1043,78 +1023,22 @@ void __init setup_arch(char **cmdline_p)  	conswitchp = &dummy_con;  #endif  #endif +	x86_init.oem.banner();  }  #ifdef CONFIG_X86_32 -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - *	Fill in any interrupts that may have been left out by the general - *	init_IRQ() routine.  interrupts having to do with the machine rather - *	than the devices on the I/O bus (like APIC interrupts in intel MP - *	systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ -	if (x86_quirks->arch_intr_init) { -		if (x86_quirks->arch_intr_init()) -			return; -	} -} - -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - *	Called as the final act of trap_init().  Used in VISWS to initialise - *	the various board specific APIC traps. - **/ -void __init x86_quirk_trap_init(void) -{ -	if (x86_quirks->arch_trap_init) { -		if (x86_quirks->arch_trap_init()) -			return; -	} -} - -static struct irqaction irq0  = { -	.handler = timer_interrupt, -	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, -	.name = "timer" +static struct resource video_ram_resource = { +	.name	= "Video RAM area", +	.start	= 0xa0000, +	.end	= 0xbffff, +	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM  }; -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) +void __init i386_reserve_resources(void)  { -	if (x86_quirks->arch_pre_time_init) -		x86_quirks->arch_pre_time_init(); +	request_resource(&iomem_resource, &video_ram_resource); +	reserve_standard_io_resources();  } -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. 
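/*
 * setup_arch() above now calls fixed x86_init hooks instead of the removed
 * x86_quirks callbacks.  A short, hypothetical sketch of a board file
 * hooking the new slots; the my_board_* names are made up, modelled on the
 * VISWS conversion further down in this patch:
 */
static void __init my_board_banner(void)
{
	printk(KERN_INFO "Booting hypothetical example board\n");
}

static void __init my_board_probe_roms(void)
{
	/* scan board-specific option ROMs here */
}

static void __init my_board_early_setup(void)
{
	x86_init.oem.banner		= my_board_banner;
	x86_init.resources.probe_roms	= my_board_probe_roms;
}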
- * - * Description: - *	Must plug the system timer interrupt source at HZ into the IRQ listed - *	in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ -	if (x86_quirks->arch_time_init) { -		/* -		 * A nonzero return code does not mean failure, it means -		 * that the architecture quirk does not want any -		 * generic (timer) setup to be performed after this: -		 */ -		if (x86_quirks->arch_time_init()) -			return; -	} - -	irq0.mask = cpumask_of_cpu(0); -	setup_irq(0, &irq0); -}  #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..34e09938265 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,122 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/io.h> + +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ +	mp_lapic_addr = address; + +	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); +	if (boot_cpu_physical_apicid == -1U) +		boot_cpu_physical_apicid = read_apic_id(); + +	pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ +	if (MAX_APICS - id <= 0) { +		pr_warning("Processor #%d invalid (max %d)\n", +			id, MAX_APICS); +		return; +	} + +	pr_info("registering lapic[%d]\n", id); + +	generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ +	struct sfi_table_simple *sb; +	struct sfi_cpu_table_entry *pentry; +	int i; +	int cpu_num; + +	sb = (struct sfi_table_simple *)table; +	cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); +	pentry = (struct sfi_cpu_table_entry *)sb->pentry; + +	for (i = 0; i < cpu_num; i++) { +		mp_sfi_register_lapic(pentry->apic_id); +		pentry++; +	} + +	smp_found_config = 1; +	return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ +	struct sfi_table_simple *sb; +	struct sfi_apic_table_entry *pentry; +	int i, num; + +	sb = (struct sfi_table_simple *)table; +	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); +	pentry = (struct sfi_apic_table_entry *)sb->pentry; + +	for (i = 0; i < num; i++) { +		mp_register_ioapic(i, pentry->phys_addr, gsi_base); +		gsi_base += io_apic_get_redir_entries(i); +		pentry++; +	} + +	WARN(pic_mode, KERN_WARNING +	
	"SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); +	pic_mode = 0; +	return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC +	mp_sfi_register_lapic_address(sfi_lapic_addr); +	sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC +	sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif +	return 0; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 81e58238c4c..6a44a76055a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs)  void  do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)  { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE  	/* notify userspace of pending MCEs */  	if (thread_info_flags & _TIF_MCE_NOTIFY)  		mce_notify_process(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a25eeec0008..09c5e077dff 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -324,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused)  	/* enable local interrupts */  	local_irq_enable(); -	setup_secondary_clock(); +	x86_cpuinit.setup_percpu_clockev();  	wmb();  	cpu_idle(); @@ -1114,7 +1114,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	printk(KERN_INFO "CPU%d: ", 0);  	print_cpu_info(&cpu_data(0)); -	setup_boot_clock(); +	x86_init.timers.setup_percpu_clockev();  	if (is_uv_system())  		uv_system_init(); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafd..0157cd26d7c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table)  	.long sys_preadv  	.long sys_pwritev  	.long sys_rt_tgsigqueueinfo	/* 335 */ -	.long sys_perf_counter_open +	.long sys_perf_event_open diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..e293ac56c72 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + *  Copyright (c) 1991,1992,1995  Linus Torvalds + *  Copyright (c) 1994  Alan Modra + *  Copyright (c) 1995  Markus Kuhn + *  Copyright (c) 1996  Ingo Molnar + *  Copyright (c) 1998  Andrea Arcangeli + *  Copyright (c) 2002,2006  Vojtech Pavlik + *  Copyright (c) 2003  Andi Kleen + * + */ + +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/mca.h> + +#include <asm/vsyscall.h> +#include <asm/x86_init.h> +#include <asm/i8259.h> +#include <asm/i8253.h> +#include <asm/timer.h> +#include <asm/hpet.h> +#include <asm/time.h> + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ +	unsigned long pc = instruction_pointer(regs); + +	if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER +		return *(unsigned long *)(regs->bp + sizeof(long)); +#else +		unsigned long *sp = (unsigned long *)regs->sp; +		/* +		 * Return address is either directly at stack pointer +		 * or above a saved flags. Eflags has bits 22-31 zero, +		 * kernel addresses don't. 
+		 */ +		if (sp[0] >> 22) +			return sp[0]; +		if (sp[1] >> 22) +			return sp[1]; +#endif +	} +	return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ +	/* Keep nmi watchdog up to date */ +	inc_irq_stat(irq0_irqs); + +	/* Optimized out for !IO_APIC and x86_64 */ +	if (timer_ack) { +		/* +		 * Subtle, when I/O APICs are used we have to ack timer IRQ +		 * manually to deassert NMI lines for the watchdog if run +		 * on an 82489DX-based system. +		 */ +		spin_lock(&i8259A_lock); +		outb(0x0c, PIC_MASTER_OCW3); +		/* Ack the IRQ; AEOI will end it automatically. */ +		inb(PIC_MASTER_POLL); +		spin_unlock(&i8259A_lock); +	} + +	global_clock_event->event_handler(global_clock_event); + +	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ +	if (MCA_bus) +		outb_p(inb_p(0x61)| 0x80, 0x61); + +	return IRQ_HANDLED; +} + +static struct irqaction irq0  = { +	.handler = timer_interrupt, +	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, +	.name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ +	irq0.mask = cpumask_of_cpu(0); +	setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ +	if (!hpet_enable()) +		setup_pit_timer(); +	setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ +	x86_init.timers.timer_init(); +	tsc_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ +	late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index 5c5d87f0b2e..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - *  Copyright (C) 1991, 1992, 1995  Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02    Alan Modra - *	fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26    Markus Kuhn - *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - *      precision CMOS clock update - * 1996-05-03    Ingo Molnar - *      fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10	Updated NTP code according to technical memorandum Jan '96 - *		"A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05    (Various) - *	More robust do_fast_gettimeoffset() algorithm implemented - *	(works with APM, Cyrix 6x86MX and Centaur C6), - *	monotonic gettimeofday() with fast_get_timeoffset(), - *	drift-proof precision TSC calibration on boot - *	(C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. - *	Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; - *	ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). - * 1998-12-16    Andrea Arcangeli - *	Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - *	because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli - *	Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - *	serialize accesses to xtime/lost_ticks). 
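/*
 * The new time.c defers timer bring-up through late_time_init so ioremap is
 * usable by the time x86_init.timers.timer_init() runs (hpet_time_init() by
 * default).  A sketch of a guest replacing that hook; my_vm_timer_init is
 * hypothetical, the VMI conversion below installs vmi_time_init the same way:
 */
static void __init my_vm_timer_init(void)
{
	/* program a paravirtual clock event device instead of PIT/HPET,
	   then claim the legacy timer interrupt */
	setup_default_timer_irq();
}

static void __init my_vm_setup(void)
{
	x86_init.timers.timer_init = my_vm_timer_init;
}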
- */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/mca.h> - -#include <asm/setup.h> -#include <asm/hpet.h> -#include <asm/time.h> -#include <asm/timer.h> - -#include <asm/do_timer.h> - -int timer_ack; - -unsigned long profile_pc(struct pt_regs *regs) -{ -	unsigned long pc = instruction_pointer(regs); - -#ifdef CONFIG_SMP -	if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER -		return *(unsigned long *)(regs->bp + sizeof(long)); -#else -		unsigned long *sp = (unsigned long *)®s->sp; - -		/* Return address is either directly at stack pointer -		   or above a saved flags. Eflags has bits 22-31 zero, -		   kernel addresses don't. */ -		if (sp[0] >> 22) -			return sp[0]; -		if (sp[1] >> 22) -			return sp[1]; -#endif -	} -#endif -	return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id) -{ -	/* Keep nmi watchdog up to date */ -	inc_irq_stat(irq0_irqs); - -#ifdef CONFIG_X86_IO_APIC -	if (timer_ack) { -		/* -		 * Subtle, when I/O APICs are used we have to ack timer IRQ -		 * manually to deassert NMI lines for the watchdog if run -		 * on an 82489DX-based system. -		 */ -		spin_lock(&i8259A_lock); -		outb(0x0c, PIC_MASTER_OCW3); -		/* Ack the IRQ; AEOI will end it automatically. */ -		inb(PIC_MASTER_POLL); -		spin_unlock(&i8259A_lock); -	} -#endif - -	do_timer_interrupt_hook(); - -#ifdef CONFIG_MCA -	if (MCA_bus) { -		/* The PS/2 uses level-triggered interrupts.  You can't -		turn them off, nor would you want to (any attempt to -		enable edge-triggered interrupts usually gets intercepted by a -		special hardware circuit).  Hence we have to acknowledge -		the timer interrupt.  Through some incredibly stupid -		design idea, the reset for IRQ 0 is done by setting the -		high bit of the PPI port B (0x61).  Note that some PS/2s, -		notably the 55SX, work fine if this is removed.  */ - -		u8 irq_v = inb_p(0x61);		/* read the current state */ -		outb_p(irq_v | 0x80, 0x61);	/* reset the IRQ */ -	} -#endif - -	return IRQ_HANDLED; -} - -/* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) -{ -	if (!hpet_enable()) -		setup_pit_timer(); -	x86_quirk_time_init(); -} - -/* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. - */ -void __init time_init(void) -{ -	x86_quirk_pre_time_init(); -	tsc_init(); -	late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index 5ba343e6184..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - *  "High Precision Event Timer" based timekeeping. 
- * - *  Copyright (c) 1991,1992,1995  Linus Torvalds - *  Copyright (c) 1994  Alan Modra - *  Copyright (c) 1995  Markus Kuhn - *  Copyright (c) 1996  Ingo Molnar - *  Copyright (c) 1998  Andrea Arcangeli - *  Copyright (c) 2002,2006  Vojtech Pavlik - *  Copyright (c) 2003  Andi Kleen - *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/clockchips.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/time.h> -#include <linux/mca.h> -#include <linux/nmi.h> - -#include <asm/i8253.h> -#include <asm/hpet.h> -#include <asm/vgtod.h> -#include <asm/time.h> -#include <asm/timer.h> - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; - -unsigned long profile_pc(struct pt_regs *regs) -{ -	unsigned long pc = instruction_pointer(regs); - -	/* Assume the lock function has either no stack frame or a copy -	   of flags from PUSHF -	   Eflags always has bits 22 and up cleared unlike kernel addresses. */ -	if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER -		return *(unsigned long *)(regs->bp + sizeof(long)); -#else -		unsigned long *sp = (unsigned long *)regs->sp; -		if (sp[0] >> 22) -			return sp[0]; -		if (sp[1] >> 22) -			return sp[1]; -#endif -	} -	return pc; -} -EXPORT_SYMBOL(profile_pc); - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ -	inc_irq_stat(irq0_irqs); - -	global_clock_event->event_handler(global_clock_event); - -#ifdef CONFIG_MCA -	if (MCA_bus) { -		u8 irq_v = inb_p(0x61);       /* read the current state */ -		outb_p(irq_v|0x80, 0x61);     /* reset the IRQ */ -	} -#endif - -	return IRQ_HANDLED; -} - -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ -	int tsc_start, tsc_now; -	int i, no_ctr_free; -	unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; -	unsigned long flags; - -	for (i = 0; i < 4; i++) -		if (avail_to_resrv_perfctr_nmi_bit(i)) -			break; -	no_ctr_free = (i == 4); -	if (no_ctr_free) { -		WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" -		     "cpu_khz value may be incorrect.\n"); -		i = 3; -		rdmsrl(MSR_K7_EVNTSEL3, evntsel3); -		wrmsrl(MSR_K7_EVNTSEL3, 0); -		rdmsrl(MSR_K7_PERFCTR3, pmc3); -	} else { -		reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); -		reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); -	} -	local_irq_save(flags); -	/* start measuring cycles, incrementing from 0 */ -	wrmsrl(MSR_K7_PERFCTR0 + i, 0); -	wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); -	rdtscl(tsc_start); -	do { -		rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); -		tsc_now = get_cycles(); -	} while ((tsc_now - tsc_start) < TICK_COUNT); - -	local_irq_restore(flags); -	if (no_ctr_free) { -		wrmsrl(MSR_K7_EVNTSEL3, 0); -		wrmsrl(MSR_K7_PERFCTR3, pmc3); -		wrmsrl(MSR_K7_EVNTSEL3, evntsel3); -	} else { -		release_perfctr_nmi(MSR_K7_PERFCTR0 + i); -		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); -	} - -	return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - -static struct irqaction irq0 = { -	.handler	= timer_interrupt, -	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, -	.name		= "timer" -}; - -void __init hpet_time_init(void) -{ -	if (!hpet_enable()) -		setup_pit_timer(); - -	setup_irq(0, &irq0); -} - -void __init time_init(void) -{ -	tsc_init(); - -	late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba1..699f7eeb896 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -4,7 +4,7 @@  #include <asm/e820.h>  /* ready for x86_64 and x86 */ -unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE);  void __init reserve_trampoline_memory(void)  { @@ -26,7 +26,7 @@ void __init reserve_trampoline_memory(void)   * bootstrap into the page concerned. The caller   * has made sure it's suitably aligned.   */ -unsigned long setup_trampoline(void) +unsigned long __cpuinit setup_trampoline(void)  {  	memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);  	return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404..8508237e8e4 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -28,16 +28,12 @@   */  #include <linux/linkage.h> +#include <linux/init.h>  #include <asm/segment.h>  #include <asm/page_types.h>  /* We can free up trampoline after bootup if cpu hotplug is not supported. */ -#ifndef CONFIG_HOTPLUG_CPU -.section ".cpuinit.data","aw",@progbits -#else -.section .rodata,"a",@progbits -#endif - +__CPUINITRODATA  .code16  ENTRY(trampoline_data) diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b..596d54c660a 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,14 +25,15 @@   */  #include <linux/linkage.h> +#include <linux/init.h>  #include <asm/pgtable_types.h>  #include <asm/page_types.h>  #include <asm/msr.h>  #include <asm/segment.h>  #include <asm/processor-flags.h> -.section .rodata, "a", @progbits - +/* We can free up the trampoline after bootup if cpu hotplug is not supported. 
*/ +__CPUINITRODATA  .code16  ENTRY(trampoline_data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 83264922a87..9346e102338 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -59,12 +59,12 @@  #include <asm/mach_traps.h>  #ifdef CONFIG_X86_64 +#include <asm/x86_init.h>  #include <asm/pgalloc.h>  #include <asm/proto.h>  #else  #include <asm/processor-flags.h>  #include <asm/setup.h> -#include <asm/traps.h>  asmlinkage int system_call(void); @@ -972,7 +972,5 @@ void __init trap_init(void)  	 */  	cpu_init(); -#ifdef CONFIG_X86_32 -	x86_quirk_trap_init(); -#endif +	x86_init.irqs.trap_init();  } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..cd982f48e23 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,8 @@  #include <asm/time.h>  #include <asm/delay.h>  #include <asm/hypervisor.h> +#include <asm/nmi.h> +#include <asm/x86_init.h>  unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */  EXPORT_SYMBOL(cpu_khz); @@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)  {  	u64 tsc1, tsc2, delta, ref1, ref2;  	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; -	unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; +	unsigned long flags, latch, ms, fast_calibrate;  	int hpet = is_hpet_enabled(), i, loopmin; -	hv_tsc_khz = get_hypervisor_tsc_freq(); -	if (hv_tsc_khz) { -		printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); -		return hv_tsc_khz; -	} -  	local_irq_save(flags);  	fast_calibrate = quick_pit_calibrate();  	local_irq_restore(flags); @@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)  	unsigned long cpu_khz_old = cpu_khz;  	if (cpu_has_tsc) { -		tsc_khz = calibrate_tsc(); +		tsc_khz = x86_platform.calibrate_tsc();  		cpu_khz = tsc_khz;  		cpu_data(0).loops_per_jiffy =  			cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||  			(val == CPUFREQ_RESUMECHANGE)) { -		*lpj = 	cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); +		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);  		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)  }  #endif +static void resume_tsc(void) +{ +	clocksource_tsc.cycle_last = 0; +} +  static struct clocksource clocksource_tsc = {  	.name                   = "tsc",  	.rating                 = 300,  	.read                   = read_tsc, +	.resume			= resume_tsc,  	.mask                   = CLOCKSOURCE_MASK(64),  	.shift                  = 22,  	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS | @@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)  {  	if (!tsc_unstable) {  		tsc_unstable = 1; -		printk("Marking TSC unstable due to %s\n", reason); +		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);  		/* Change only the rating, when not registered */  		if (clocksource_tsc.mult) -			clocksource_change_rating(&clocksource_tsc, 0); -		else +			clocksource_mark_unstable(&clocksource_tsc); +		else { +			clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;  			clocksource_tsc.rating = 0; +		}  	}  } @@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)  	clocksource_register(&clocksource_tsc);  } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used 
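/*
 * trap_init() above now finishes by calling x86_init.irqs.trap_init(), which
 * defaults to a no-op, alongside the pre_vector_init/intr_init hooks defined
 * in x86_init.c at the end of this patch.  A hypothetical platform quirk
 * would plug in like this (my_plat_* names are illustrative):
 */
static void __init my_plat_trap_init(void)
{
	/* install board-specific gates here, as the VISWS code below does */
}

static void __init my_plat_irq_setup(void)
{
	x86_init.irqs.trap_init = my_plat_trap_init;
}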
on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ +	int tsc_start, tsc_now; +	int i, no_ctr_free; +	unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; +	unsigned long flags; + +	for (i = 0; i < 4; i++) +		if (avail_to_resrv_perfctr_nmi_bit(i)) +			break; +	no_ctr_free = (i == 4); +	if (no_ctr_free) { +		WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " +		     "cpu_khz value may be incorrect.\n"); +		i = 3; +		rdmsrl(MSR_K7_EVNTSEL3, evntsel3); +		wrmsrl(MSR_K7_EVNTSEL3, 0); +		rdmsrl(MSR_K7_PERFCTR3, pmc3); +	} else { +		reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); +		reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); +	} +	local_irq_save(flags); +	/* start measuring cycles, incrementing from 0 */ +	wrmsrl(MSR_K7_PERFCTR0 + i, 0); +	wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); +	rdtscl(tsc_start); +	do { +		rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); +		tsc_now = get_cycles(); +	} while ((tsc_now - tsc_start) < TICK_COUNT); + +	local_irq_restore(flags); +	if (no_ctr_free) { +		wrmsrl(MSR_K7_EVNTSEL3, 0); +		wrmsrl(MSR_K7_PERFCTR3, pmc3); +		wrmsrl(MSR_K7_EVNTSEL3, evntsel3); +	} else { +		release_perfctr_nmi(MSR_K7_PERFCTR0 + i); +		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); +	} + +	return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif +  void __init tsc_init(void)  {  	u64 lpj;  	int cpu; +	x86_init.timers.tsc_pre_init(); +  	if (!cpu_has_tsc)  		return; -	tsc_khz = calibrate_tsc(); +	tsc_khz = x86_platform.calibrate_tsc();  	cpu_khz = tsc_khz;  	if (!tsc_khz) { @@ -868,11 +928,9 @@ void __init tsc_init(void)  		return;  	} -#ifdef CONFIG_X86_64  	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&  			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))  		cpu_khz = calibrate_cpu(); -#endif  	printk("Detected %lu.%03lu MHz processor.\n",  			(unsigned long)cpu_khz / 1000, diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@  #include <asm/setup.h>  #include <asm/apic.h>  #include <asm/e820.h> +#include <asm/time.h>  #include <asm/io.h>  #include <linux/kernel_stat.h> @@ -53,7 +54,7 @@ int is_visws_box(void)  	return visws_board_type >= 0;  } -static int __init visws_time_init(void) +static void __init visws_time_init(void)  {  	printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,21 +67,13 @@ static int __init visws_time_init(void)  	/* Enable (unmask) the timer interrupt */  	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); -	/* -	 * Zero return means the generic timer setup code will set up -	 * the standard vector: -	 */ -	return 0; +	setup_default_timer_irq();  } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void)  {  	init_VISWS_APIC_irqs(); - -	/* -	 * We dont want ISA irqs to be set up by the generic code: -	 */ -	return 1;  }  /* Quirk for machine specific memory setup. 
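/*
 * tsc_init() above now asks x86_platform.calibrate_tsc for the frequency,
 * replacing the get_hypervisor_tsc_freq() special case removed from
 * native_calibrate_tsc().  A hedged sketch of a hypervisor client;
 * my_hv_tsc_khz is illustrative, vmi_tsc_khz below is the real example:
 */
static unsigned long my_hv_tsc_khz(void)
{
	/* return the TSC frequency in kHz as reported by the hypervisor */
	return 2000000;	/* placeholder: 2 GHz */
}

static void __init my_hv_time_init(void)
{
	x86_platform.calibrate_tsc = my_hv_tsc_khz;
}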
*/ @@ -156,12 +149,8 @@ static void visws_machine_power_off(void)  	outl(PIIX_SPECIAL_STOP, 0xCFC);  } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early)  { -	/* -	 * Prevent MP-table parsing by the generic code: -	 */ -	return 1;  }  /* @@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)  	apic_version[m->apicid] = ver;  } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve)  {  	struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);  	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)  		MP_processor_info(mp++);  	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - -	return 1;  } -static int visws_trap_init(void); - -static struct x86_quirks visws_x86_quirks __initdata = { -	.arch_time_init		= visws_time_init, -	.arch_pre_intr_init	= visws_pre_intr_init, -	.arch_memory_setup	= visws_memory_setup, -	.arch_intr_init		= NULL, -	.arch_trap_init		= visws_trap_init, -	.mach_get_smp_config	= visws_get_smp_config, -	.mach_find_smp_config	= visws_find_smp_config, -}; +static void visws_trap_init(void);  void __init visws_early_detect(void)  { @@ -257,11 +234,14 @@ void __init visws_early_detect(void)  		return;  	/* -	 * Install special quirks for timer, interrupt and memory setup: -	 * Fall back to generic behavior for traps: -	 * Override generic MP-table parsing: +	 * Override the default platform setup functions  	 */ -	x86_quirks = &visws_x86_quirks; +	x86_init.resources.memory_setup = visws_memory_setup; +	x86_init.mpparse.get_smp_config = visws_get_smp_config; +	x86_init.mpparse.find_smp_config = visws_find_smp_config; +	x86_init.irqs.pre_vector_init = visws_pre_intr_init; +	x86_init.irqs.trap_init = visws_trap_init; +	x86_init.timers.timer_init = visws_time_init;  	/*  	 * Install reboot quirks: @@ -400,12 +380,10 @@ static __init void cobalt_init(void)  		co_apic_read(CO_APIC_ID));  } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void)  {  	lithium_init();  	cobalt_init(); - -	return 1;  }  /* diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..31e6f6cfe53 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)  		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);  		vmi_timer_ops.cancel_alarm =  			 vmi_get_function(VMI_CALL_CancelAlarm); -		pv_time_ops.time_init = vmi_time_init; -		pv_time_ops.get_wallclock = vmi_get_wallclock; -		pv_time_ops.set_wallclock = vmi_set_wallclock; +		x86_init.timers.timer_init = vmi_time_init;  #ifdef CONFIG_X86_LOCAL_APIC -		pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; -		pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; +		x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; +		x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;  #endif  		pv_time_ops.sched_clock = vmi_sched_clock; -		pv_time_ops.get_tsc_khz = vmi_tsc_khz; +		x86_platform.calibrate_tsc = vmi_tsc_khz; +		x86_platform.get_wallclock = vmi_get_wallclock; +		x86_platform.set_wallclock = vmi_set_wallclock;  		/* We have true wallclock functions; disable CMOS clock sync */  		no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ 
-68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)  	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));  } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */  unsigned long vmi_tsc_khz(void)  {  	unsigned long long khz; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0ccb57d5ee3..a46acccec38 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -45,9 +45,9 @@ PHDRS {  	text PT_LOAD FLAGS(5);          /* R_E */  	data PT_LOAD FLAGS(7);          /* RWE */  #ifdef CONFIG_X86_64 -	user PT_LOAD FLAGS(7);          /* RWE */ +	user PT_LOAD FLAGS(5);          /* R_E */  #ifdef CONFIG_SMP -	percpu PT_LOAD FLAGS(7);        /* RWE */ +	percpu PT_LOAD FLAGS(6);        /* RW_ */  #endif  	init PT_LOAD FLAGS(7);          /* RWE */  #endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..cf53a78e2dc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)  	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;  	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;  	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; +	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();  	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);  } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..4449a4a2c2e --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + *  For licencing details see kernel-base/COPYING + */ +#include <linux/init.h> + +#include <asm/bios_ebda.h> +#include <asm/paravirt.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> +#include <asm/tsc.h> + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. 
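/*
 * The vmlinux.lds.S hunk above tightens the ELF program header permissions:
 * FLAGS() takes the standard p_flags bits (PF_X = 1, PF_W = 2, PF_R = 4),
 * so the user segment drops from 7 (RWE) to 5 (R_E) and the percpu segment
 * to 6 (RW_).  A stand-alone, userspace reference check of that encoding,
 * purely illustrative:
 */
#include <elf.h>
#include <stdio.h>

int main(void)
{
	printf("R_E=%d RW_=%d RWE=%d\n",
	       PF_R | PF_X, PF_R | PF_W, PF_R | PF_W | PF_X);
	return 0;	/* prints: R_E=5 RW_=6 RWE=7 */
}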
+ */ +struct x86_init_ops x86_init __initdata = { + +	.resources = { +		.probe_roms		= x86_init_noop, +		.reserve_resources	= reserve_standard_io_resources, +		.memory_setup		= default_machine_specific_memory_setup, +	}, + +	.mpparse = { +		.mpc_record		= x86_init_uint_noop, +		.setup_ioapic_ids	= x86_init_noop, +		.mpc_apic_id		= default_mpc_apic_id, +		.smp_read_mpc_oem	= default_smp_read_mpc_oem, +		.mpc_oem_bus_info	= default_mpc_oem_bus_info, +		.find_smp_config	= default_find_smp_config, +		.get_smp_config		= default_get_smp_config, +	}, + +	.irqs = { +		.pre_vector_init	= init_ISA_irqs, +		.intr_init		= native_init_IRQ, +		.trap_init		= x86_init_noop, +	}, + +	.oem = { +		.arch_setup		= x86_init_noop, +		.banner			= default_banner, +	}, + +	.paging = { +		.pagetable_setup_start	= native_pagetable_setup_start, +		.pagetable_setup_done	= native_pagetable_setup_done, +	}, + +	.timers = { +		.setup_percpu_clockev	= setup_boot_APIC_clock, +		.tsc_pre_init		= x86_init_noop, +		.timer_init		= hpet_time_init, +	}, +}; + +struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +	.setup_percpu_clockev		= setup_secondary_APIC_clock, +}; + +struct x86_platform_ops x86_platform = { +	.calibrate_tsc			= native_calibrate_tsc, +	.get_wallclock			= mach_get_cmos_time, +	.set_wallclock			= mach_set_rtc_mmss, +};  |
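/*
 * Because x86_init is marked __initdata (and x86_cpuinit __cpuinitdata), any
 * override of the default table above has to happen during early boot,
 * before init memory is freed.  A final hypothetical example, mirroring how
 * the VISWS conversion in this patch skips generic MP-table parsing:
 */
static void __init my_subarch_early_detect(void)
{
	x86_init.mpparse.find_smp_config = x86_init_uint_noop;
	x86_init.mpparse.get_smp_config  = x86_init_uint_noop;
}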