Diffstat (limited to 'arch/x86/kernel')
58 files changed, 1695 insertions, 1300 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 56ebd1f9844..bb8529275aa 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -49,7 +49,6 @@ obj-y				+= cpu/  obj-y				+= acpi/  obj-y				+= reboot.o  obj-$(CONFIG_X86_32)		+= reboot_32.o -obj-$(CONFIG_MCA)		+= mca_32.o  obj-$(CONFIG_X86_MSR)		+= msr.o  obj-$(CONFIG_X86_CPUID)		+= cpuid.o  obj-$(CONFIG_PCI)		+= early-quirks.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7c439fe4941..8afb6931981 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void)  	int i;  	struct mpc_intsrc mp_irq; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA  	/*  	 * Fabricate the legacy ISA bus (bus #31).  	 */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3722179a49d..39a222e094a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1326,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)  			       acked);  			break;  		} -		if (cpu_has_tsc) { -			rdtscll(ntsc); -			max_loops = (cpu_khz << 10) - (ntsc - tsc); -		} else -			max_loops--; +		if (queued) { +			if (cpu_has_tsc) { +				rdtscll(ntsc); +				max_loops = (cpu_khz << 10) - (ntsc - tsc); +			} else +				max_loops--; +		}  	} while (queued && max_loops > 0);  	WARN_ON(max_loops <= 0); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 359b6899a36..0e881c46e8c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -227,6 +227,7 @@ static struct apic apic_flat =  {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, @@ -386,6 +387,7 @@ static struct apic apic_physflat =  {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 634ae6cdd5c..a6e4c6e06c0 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -181,6 +181,7 @@ struct apic apic_noop = {  	.read				= noop_apic_read,  	.write				= noop_apic_write, +	.eoi_write			= noop_apic_write,  	.icr_read			= noop_apic_icr_read,  	.icr_write			= noop_apic_icr_write,  	.wait_icr_idle			= noop_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 23e75422e01..6ec6d5d297c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0cdec7065af..31fbdbfbf96 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= 
native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e42d1d3b913..db4ab1be3c7 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, @@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ef0648cd708..ac96561d1a9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -68,24 +68,6 @@  #define for_each_irq_pin(entry, head) \  	for (entry = head; entry; entry = entry->next) -static void		__init __ioapic_init_mappings(void); - -static unsigned int	__io_apic_read  (unsigned int apic, unsigned int reg); -static void		__io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); -static void		__io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); - -static struct io_apic_ops io_apic_ops = { -	.init	= __ioapic_init_mappings, -	.read	= __io_apic_read, -	.write	= __io_apic_write, -	.modify = __io_apic_modify, -}; - -void __init set_io_apic_ops(const struct io_apic_ops *ops) -{ -	io_apic_ops = *ops; -} -  #ifdef CONFIG_IRQ_REMAP  static void irq_remap_modify_chip_defaults(struct irq_chip *chip);  static inline bool irq_remapped(struct irq_cfg *cfg) @@ -158,7 +140,7 @@ int mp_irq_entries;  /* GSI interrupts */  static int nr_irqs_gsi = NR_IRQS_LEGACY; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA  int mp_bus_id_to_type[MAX_MP_BUSSES];  #endif @@ -329,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)  	irq_free_desc(at);  } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ -	return io_apic_ops.read(apic, reg); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ -	io_apic_ops.write(apic, reg, value); -} - -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ -	io_apic_ops.modify(apic, reg, value); -} -  struct io_apic {  	unsigned int index; @@ -365,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)  	writel(vector, &io_apic->eoi);  } -static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)  {  	struct io_apic __iomem *io_apic = io_apic_base(apic);  	writel(reg, &io_apic->index);  	return readl(&io_apic->data);  } -static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)  {  	struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -386,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va   *   * Older SiS APIC requires we rewrite the index register   */ -static void 
__io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)  {  	struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -395,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v  	writel(value, &io_apic->data);  } -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) -{ -	struct irq_pin_list *entry; -	unsigned long flags; - -	raw_spin_lock_irqsave(&ioapic_lock, flags); -	for_each_irq_pin(entry, cfg->irq_2_pin) { -		unsigned int reg; -		int pin; - -		pin = entry->pin; -		reg = io_apic_read(entry->apic, 0x10 + pin*2); -		/* Is the remote IRR bit set? */ -		if (reg & IO_APIC_REDIR_REMOTE_IRR) { -			raw_spin_unlock_irqrestore(&ioapic_lock, flags); -			return true; -		} -	} -	raw_spin_unlock_irqrestore(&ioapic_lock, flags); - -	return false; -} -  union entry_union {  	struct { u32 w1, w2; };  	struct IO_APIC_route_entry entry; @@ -891,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type)  	return -1;  } -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA  /*   * EISA Edge/Level control register, ELCR   */ @@ -928,12 +872,6 @@ static int EISA_ELCR(unsigned int irq)  #define default_PCI_trigger(idx)	(1)  #define default_PCI_polarity(idx)	(1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx)	(1) -#define default_MCA_polarity(idx)	default_ISA_polarity(idx) -  static int irq_polarity(int idx)  {  	int bus = mp_irqs[idx].srcbus; @@ -991,7 +929,7 @@ static int irq_trigger(int idx)  				trigger = default_ISA_trigger(idx);  			else  				trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA  			switch (mp_bus_id_to_type[bus]) {  				case MP_BUS_ISA: /* ISA pin */  				{ @@ -1008,11 +946,6 @@ static int irq_trigger(int idx)  					/* set before the switch */  					break;  				} -				case MP_BUS_MCA: /* MCA pin */ -				{ -					trigger = default_MCA_trigger(idx); -					break; -				}  				default:  				{  					printk(KERN_WARNING "broken BIOS!!\n"); @@ -2439,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data)  atomic_t irq_mis_count;  #ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ +	struct irq_pin_list *entry; +	unsigned long flags; + +	raw_spin_lock_irqsave(&ioapic_lock, flags); +	for_each_irq_pin(entry, cfg->irq_2_pin) { +		unsigned int reg; +		int pin; + +		pin = entry->pin; +		reg = io_apic_read(entry->apic, 0x10 + pin*2); +		/* Is the remote IRR bit set? 
*/ +		if (reg & IO_APIC_REDIR_REMOTE_IRR) { +			raw_spin_unlock_irqrestore(&ioapic_lock, flags); +			return true; +		} +	} +	raw_spin_unlock_irqrestore(&ioapic_lock, flags); + +	return false; +} +  static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)  {  	/* If we are moving the irq we need to mask it */ @@ -3756,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)  	return res;  } -void __init ioapic_and_gsi_init(void) -{ -	io_apic_ops.init(); -} - -static void __init __ioapic_init_mappings(void) +void __init native_io_apic_init_mappings(void)  {  	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;  	struct resource *ioapic_res; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 00d2422ca7c..f00a68cca37 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ff2c1b9aac4..1b291da09e6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -142,6 +142,7 @@ static struct apic apic_default = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index fea000b27f0..659897c0075 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -546,6 +546,7 @@ static struct apic apic_summit = {  	.read				= native_apic_mem_read,  	.write				= native_apic_mem_write, +	.eoi_write			= native_apic_mem_write,  	.icr_read			= native_apic_icr_read,  	.icr_write			= native_apic_icr_write,  	.wait_icr_idle			= native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 48f3103b3c9..ff35cff0e1a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {  	.read				= native_apic_msr_read,  	.write				= native_apic_msr_write, +	.eoi_write			= native_apic_msr_eoi_write,  	.icr_read			= native_x2apic_icr_read,  	.icr_write			= native_x2apic_icr_write,  	.wait_icr_idle			= native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 991e315f422..c17e982db27 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = {  	.read				= native_apic_msr_read,  	.write				= native_apic_msr_write, +	.eoi_write			= native_apic_msr_eoi_write,  	.icr_read			= native_x2apic_icr_read,  	.icr_write			= native_x2apic_icr_write,  	.wait_icr_idle			= native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 87bfa69e216..c6d03f7a440 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {  	.read				= native_apic_msr_read,  	.write				= 
native_apic_msr_write, +	.eoi_write			= native_apic_msr_eoi_write,  	.icr_read			= native_x2apic_icr_read,  	.icr_write			= native_x2apic_icr_write,  	.wait_icr_idle			= native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5da1269e8dd..e2dbcb7dabd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -27,21 +27,29 @@ static int num_scan_areas;  static __init int set_corruption_check(char *arg)  { -	char *end; +	ssize_t ret; +	unsigned long val; -	memory_corruption_check = simple_strtol(arg, &end, 10); +	ret = kstrtoul(arg, 10, &val); +	if (ret) +		return ret; -	return (*end == 0) ? 0 : -EINVAL; +	memory_corruption_check = val; +	return 0;  }  early_param("memory_corruption_check", set_corruption_check);  static __init int set_corruption_check_period(char *arg)  { -	char *end; +	ssize_t ret; +	unsigned long val; -	corruption_check_period = simple_strtoul(arg, &end, 10); +	ret = kstrtoul(arg, 10, &val); +	if (ret) +		return ret; -	return (*end == 0) ? 0 : -EINVAL; +	corruption_check_period = val; +	return 0;  }  early_param("memory_corruption_check_period", set_corruption_check_period); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cf79302198a..82f29e70d05 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void)  	oist = &per_cpu(orig_ist, cpu);  #ifdef CONFIG_NUMA -	if (cpu != 0 && percpu_read(numa_node) == 0 && +	if (cpu != 0 && this_cpu_read(numa_node) == 0 &&  	    early_cpu_to_node(cpu) != NUMA_NO_NODE)  		set_numa_node(early_cpu_to_node(cpu));  #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index b8f3653dddb..9a7c90d80bc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)  					new_l2 = this_leaf.size/1024;  					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;  					index_msb = get_count_order(num_threads_sharing); -					l2_id = c->apicid >> index_msb; +					l2_id = c->apicid & ~((1 << index_msb) - 1);  					break;  				case 3:  					new_l3 = this_leaf.size/1024;  					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;  					index_msb = get_count_order(  							num_threads_sharing); -					l3_id = c->apicid >> index_msb; +					l3_id = c->apicid & ~((1 << index_msb) - 1);  					break;  				default:  					break; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 5502b289341..36565373af8 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -23,7 +23,7 @@   * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)   *   * Arrays used to match for this should also be declared using - * MODULE_DEVICE_TABLE(x86_cpu, ...) + * MODULE_DEVICE_TABLE(x86cpu, ...)   *   * This always matches against the boot cpu, assuming models and features are   * consistent over all CPUs. 
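
As a hedged illustration of the convention described in the cpu/match.c comment above (the MODULE_DEVICE_TABLE alias is "x86cpu", not "x86_cpu"), a driver-side match table might look like the sketch below. It is not part of this patch; example_cpu_ids and example_init are hypothetical names, and the chosen vendor/family values are arbitrary.

	#include <linux/module.h>
	#include <asm/cpu_device_id.h>

	/* Hypothetical table: match any Intel family-6 CPU, no feature required. */
	static const struct x86_cpu_id example_cpu_ids[] = {
		{ .vendor = X86_VENDOR_INTEL, .family = 6,
		  .model = X86_MODEL_ANY, .feature = X86_FEATURE_ANY },
		{ }	/* terminating empty entry, required by the matcher */
	};
	/* "x86cpu" is the device-table alias, per the corrected comment above. */
	MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);

	static int __init example_init(void)
	{
		/* x86_match_cpu() checks the boot CPU against the table. */
		if (!x86_match_cpu(example_cpu_ids))
			return -ENODEV;
		return 0;
	}
	module_init(example_init);

Declaring the table with MODULE_DEVICE_TABLE(x86cpu, ...) is what lets userspace tools autoload the module on matching hardware; x86_match_cpu() only performs the runtime check against the boot CPU.
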
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 11c9166c333..2afcbd253e1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  	struct mce m;  	int i; -	percpu_inc(mce_poll_count); +	this_cpu_inc(mce_poll_count);  	mce_gather_info(&m, NULL); @@ -1017,7 +1017,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	atomic_inc(&mce_entry); -	percpu_inc(mce_exception_count); +	this_cpu_inc(mce_exception_count);  	if (!banks)  		goto out; @@ -1431,6 +1431,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)  		 */  		 if (c->x86 == 6 && banks > 0)  			mce_banks[0].ctl = 0; + +		 /* +		  * Turn off MC4_MISC thresholding banks on those models since +		  * they're not supported there. +		  */ +		 if (c->x86 == 0x15 && +		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { +			 int i; +			 u64 val, hwcr; +			 bool need_toggle; +			 u32 msrs[] = { +				0x00000413, /* MC4_MISC0 */ +				0xc0000408, /* MC4_MISC1 */ +			 }; + +			 rdmsrl(MSR_K7_HWCR, hwcr); + +			 /* McStatusWrEn has to be set */ +			 need_toggle = !(hwcr & BIT(18)); + +			 if (need_toggle) +				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + +			 for (i = 0; i < ARRAY_SIZE(msrs); i++) { +				 rdmsrl(msrs[i], val); + +				 /* CntP bit set? */ +				 if (val & BIT(62)) { +					 val &= ~BIT(62); +					 wrmsrl(msrs[i], val); +				 } +			 } + +			 /* restore old settings */ +			 if (need_toggle) +				 wrmsrl(MSR_K7_HWCR, hwcr); +		 }  	}  	if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 99b57179f91..f4873a64f46 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -51,6 +51,7 @@ struct threshold_block {  	unsigned int		cpu;  	u32			address;  	u16			interrupt_enable; +	bool			interrupt_capable;  	u16			threshold_limit;  	struct kobject		kobj;  	struct list_head	miscj; @@ -83,6 +84,21 @@ struct thresh_restart {  	u16			old_limit;  }; +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ +	/* +	 * bank 4 supports APIC LVT interrupts implicitly since forever. +	 */ +	if (bank == 4) +		return true; + +	/* +	 * IntP: interrupt present; if this bit is set, the thresholding +	 * bank can generate APIC LVT interrupts +	 */ +	return msr_high_bits & BIT(28); +} +  static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)  {  	int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)  	return 1;  }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */  static void threshold_restart_bank(void *_tr)  {  	struct thresh_restart *tr = _tr; @@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr)  		    (new_count & THRESHOLD_MAX);  	} +	/* clear IntType */ +	hi &= ~MASK_INT_TYPE_HI; + +	if (!tr->b->interrupt_capable) +		goto done; +  	if (tr->set_lvt_off) {  		if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {  			/* set new lvt offset */ @@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr)  		}  	} -	tr->b->interrupt_enable ? 
-	    (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : -	    (hi &= ~MASK_INT_TYPE_HI); +	if (tr->b->interrupt_enable) +		hi |= INT_TYPE_APIC; + + done:  	hi |= MASK_COUNT_EN_HI;  	wrmsr(tr->b->address, lo, hi); @@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			if (shared_bank[bank] && c->cpu_core_id)  				break; -			offset = setup_APIC_mce(offset, -						(high & MASK_LVTOFF_HI) >> 20); -  			memset(&b, 0, sizeof(b)); -			b.cpu		= cpu; -			b.bank		= bank; -			b.block		= block; -			b.address	= address; +			b.cpu			= cpu; +			b.bank			= bank; +			b.block			= block; +			b.address		= address; +			b.interrupt_capable	= lvt_interrupt_supported(bank, high); + +			if (b.interrupt_capable) { +				int new = (high & MASK_LVTOFF_HI) >> 20; +				offset  = setup_APIC_mce(offset, new); +			}  			mce_threshold_block_init(&b, offset);  			mce_threshold_vector = amd_threshold_interrupt; @@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)  	struct thresh_restart tr;  	unsigned long new; +	if (!b->interrupt_capable) +		return -EINVAL; +  	if (strict_strtoul(buf, 0, &new) < 0)  		return -EINVAL; @@ -390,10 +421,10 @@ RW_ATTR(threshold_limit);  RW_ATTR(error_count);  static struct attribute *default_attrs[] = { -	&interrupt_enable.attr,  	&threshold_limit.attr,  	&error_count.attr, -	NULL +	NULL,	/* possibly interrupt_enable if supported, see below */ +	NULL,  };  #define to_block(k)	container_of(k, struct threshold_block, kobj) @@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  	b->cpu			= cpu;  	b->address		= address;  	b->interrupt_enable	= 0; +	b->interrupt_capable	= lvt_interrupt_supported(bank, high);  	b->threshold_limit	= THRESHOLD_MAX; +	if (b->interrupt_capable) +		threshold_ktype.default_attrs[2] = &interrupt_enable.attr; +	else +		threshold_ktype.default_attrs[2] = NULL; +  	INIT_LIST_HEAD(&b->miscj);  	if (per_cpu(threshold_banks, cpu)[bank]->blocks) { diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e03407e1..e049d6da018 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)  	/* mark unused */  	event->hw.extra_reg.idx = EXTRA_REG_NONE; - -	/* mark not used */ -	event->hw.extra_reg.idx = EXTRA_REG_NONE;  	event->hw.branch_reg.idx = EXTRA_REG_NONE;  	return x86_pmu.hw_config(event); @@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)  	int idx, handled = 0;  	u64 val; -	perf_sample_data_init(&data, 0); -  	cpuc = &__get_cpu_var(cpu_hw_events);  	/* @@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)  		 * event overflow  		 */  		handled++; -		data.period	= event->hw.last_period; +		perf_sample_data_init(&data, 0, event->hw.last_period);  		if (!x86_perf_event_set_period(event))  			continue; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0..11a4eb9131d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)  static int amd_pmu_hw_config(struct perf_event *event)  { -	int ret = x86_pmu_hw_config(event); +	int ret; +	/* pass precise event sampling to ibs: */ +	if (event->attr.precise_ip && get_ibs_caps()) +		return -ENOENT; + +	ret = x86_pmu_hw_config(event);  	if (ret)  		return ret; @@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct 
cpu_hw_events *cpuc,  	 * when we come here  	 */  	for (i = 0; i < x86_pmu.num_counters; i++) { -		if (nb->owners[i] == event) { -			cmpxchg(nb->owners+i, event, NULL); +		if (cmpxchg(nb->owners + i, event, NULL) == event)  			break; -		}  	}  } @@ -493,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = {   * 0x023	DE	PERF_CTL[2:0]   * 0x02D	LS	PERF_CTL[3]   * 0x02E	LS	PERF_CTL[3,0] + * 0x031	LS	PERF_CTL[2:0] (**)   * 0x043	CU	PERF_CTL[2:0]   * 0x045	CU	PERF_CTL[2:0]   * 0x046	CU	PERF_CTL[2:0] @@ -506,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = {   * 0x0DD	LS	PERF_CTL[5:0]   * 0x0DE	LS	PERF_CTL[5:0]   * 0x0DF	LS	PERF_CTL[5:0] + * 0x1C0	EX	PERF_CTL[5:3]   * 0x1D6	EX	PERF_CTL[5:0]   * 0x1D8	EX	PERF_CTL[5:0]   * - * (*) depending on the umask all FPU counters may be used + * (*)  depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time   */  static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0); @@ -559,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev  			return &amd_f15_PMC3;  		case 0x02E:  			return &amd_f15_PMC30; +		case 0x031: +			if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) +				return &amd_f15_PMC20; +			return &emptyconstraint; +		case 0x1C0: +			return &amd_f15_PMC53;  		default:  			return &amd_f15_PMC50;  		} diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14..da9bcdcd985 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@  #include <linux/perf_event.h>  #include <linux/module.h>  #include <linux/pci.h> +#include <linux/ptrace.h>  #include <asm/apic.h> @@ -16,36 +17,591 @@ static u32 ibs_caps;  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT + +enum ibs_states { +	IBS_ENABLED	= 0, +	IBS_STARTED	= 1, +	IBS_STOPPING	= 2, + +	IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { +	struct perf_event	*event; +	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { +	struct pmu	pmu; +	unsigned int	msr; +	u64		config_mask; +	u64		cnt_mask; +	u64		enable_mask; +	u64		valid_mask; +	u64		max_period; +	unsigned long	offset_mask[1]; +	int		offset_max; +	struct cpu_perf_ibs __percpu *pcpu; +	u64		(*get_count)(u64 config); +}; + +struct perf_ibs_data { +	u32		size; +	union { +		u32	data[0];	/* data buffer starts here */ +		u32	caps; +	}; +	u64		regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ +	s64 left = local64_read(&hwc->period_left); +	s64 period = hwc->sample_period; +	int overflow = 0; + +	/* +	 * If we are way outside a reasonable range then just skip forward: +	 */ +	if (unlikely(left <= -period)) { +		left = period; +		local64_set(&hwc->period_left, left); +		hwc->last_period = period; +		overflow = 1; +	} + +	if (unlikely(left < (s64)min)) { +		left += period; +		local64_set(&hwc->period_left, left); +		hwc->last_period = period; +		overflow = 1; +	} + +	/* +	 * If the hw period that triggers the sw overflow is too short +	 * we might hit the irq handler. This biases the results. 
+	 * Thus we shorten the next-to-last period and set the last +	 * period to the max period. +	 */ +	if (left > max) { +		left -= max; +		if (left > max) +			left = max; +		else if (left < min) +			left = min; +	} + +	*hw_period = (u64)left; + +	return overflow; +} + +static  int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ +	struct hw_perf_event *hwc = &event->hw; +	int shift = 64 - width; +	u64 prev_raw_count; +	u64 delta; + +	/* +	 * Careful: an NMI might modify the previous event value. +	 * +	 * Our tactic to handle this is to first atomically read and +	 * exchange a new raw count - then add that new-prev delta +	 * count to the generic event atomically: +	 */ +	prev_raw_count = local64_read(&hwc->prev_count); +	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, +					new_raw_count) != prev_raw_count) +		return 0; + +	/* +	 * Now we have the new raw value and have updated the prev +	 * timestamp already. We can now calculate the elapsed delta +	 * (event-)time and add that to the generic event. +	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. +	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	local64_add(delta, &event->count); +	local64_sub(delta, &hwc->period_left); + +	return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ +	if (perf_ibs_fetch.pmu.type == type) +		return &perf_ibs_fetch; +	if (perf_ibs_op.pmu.type == type) +		return &perf_ibs_op; +	return NULL; +} + +/* + * Use IBS for precise event sampling: + * + *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count + *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p + *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. 
+ * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ +	switch (event->attr.precise_ip) { +	case 0: +		return -ENOENT; +	case 1: +	case 2: +		break; +	default: +		return -EOPNOTSUPP; +	} + +	switch (event->attr.type) { +	case PERF_TYPE_HARDWARE: +		switch (event->attr.config) { +		case PERF_COUNT_HW_CPU_CYCLES: +			*config = 0; +			return 0; +		} +		break; +	case PERF_TYPE_RAW: +		switch (event->attr.config) { +		case 0x0076: +			*config = 0; +			return 0; +		case 0x00C1: +			*config = IBS_OP_CNT_CTL; +			return 0; +		} +		break; +	default: +		return -ENOENT; +	} + +	return -EOPNOTSUPP; +}  static int perf_ibs_init(struct perf_event *event)  { -	if (perf_ibs.type != event->attr.type) +	struct hw_perf_event *hwc = &event->hw; +	struct perf_ibs *perf_ibs; +	u64 max_cnt, config; +	int ret; + +	perf_ibs = get_ibs_pmu(event->attr.type); +	if (perf_ibs) { +		config = event->attr.config; +	} else { +		perf_ibs = &perf_ibs_op; +		ret = perf_ibs_precise_event(event, &config); +		if (ret) +			return ret; +	} + +	if (event->pmu != &perf_ibs->pmu)  		return -ENOENT; + +	if (config & ~perf_ibs->config_mask) +		return -EINVAL; + +	if (hwc->sample_period) { +		if (config & perf_ibs->cnt_mask) +			/* raw max_cnt may not be set */ +			return -EINVAL; +		if (!event->attr.sample_freq && hwc->sample_period & 0x0f) +			/* +			 * lower 4 bits can not be set in ibs max cnt, +			 * but allowing it in case we adjust the +			 * sample period to set a frequency. +			 */ +			return -EINVAL; +		hwc->sample_period &= ~0x0FULL; +		if (!hwc->sample_period) +			hwc->sample_period = 0x10; +	} else { +		max_cnt = config & perf_ibs->cnt_mask; +		config &= ~perf_ibs->cnt_mask; +		event->attr.sample_period = max_cnt << 4; +		hwc->sample_period = event->attr.sample_period; +	} + +	if (!hwc->sample_period) +		return -EINVAL; + +	/* +	 * If we modify hwc->sample_period, we also need to update +	 * hwc->last_period and hwc->period_left. +	 */ +	hwc->last_period = hwc->sample_period; +	local64_set(&hwc->period_left, hwc->sample_period); + +	hwc->config_base = perf_ibs->msr; +	hwc->config = config; +  	return 0;  } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, +			       struct hw_perf_event *hwc, u64 *period) +{ +	int overflow; + +	/* ignore lower 4 bits in min count: */ +	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); +	local64_set(&hwc->prev_count, 0); + +	return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ +	return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ +	u64 count = 0; + +	if (config & IBS_OP_VAL) +		count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + +	if (ibs_caps & IBS_CAPS_RDWROPCNT) +		count += (config & IBS_OP_CUR_CNT) >> 32; + +	return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, +		      u64 *config) +{ +	u64 count = perf_ibs->get_count(*config); + +	/* +	 * Set width to 64 since we do not overflow on max width but +	 * instead on max count. In perf_ibs_set_period() we clear +	 * prev count manually on overflow. 
+	 */ +	while (!perf_event_try_update(event, count, 64)) { +		rdmsrl(event->hw.config_base, *config); +		count = perf_ibs->get_count(*config); +	} +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, +					 struct hw_perf_event *hwc, u64 config) +{ +	wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, +					  struct hw_perf_event *hwc, u64 config) +{ +	config &= ~perf_ibs->cnt_mask; +	wrmsrl(hwc->config_base, config); +	config &= ~perf_ibs->enable_mask; +	wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ +	struct hw_perf_event *hwc = &event->hw; +	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); +	u64 period; + +	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) +		return; + +	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); +	hwc->state = 0; + +	perf_ibs_set_period(perf_ibs, hwc, &period); +	set_bit(IBS_STARTED, pcpu->state); +	perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + +	perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ +	struct hw_perf_event *hwc = &event->hw; +	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); +	u64 config; +	int stopping; + +	stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + +	if (!stopping && (hwc->state & PERF_HES_UPTODATE)) +		return; + +	rdmsrl(hwc->config_base, config); + +	if (stopping) { +		set_bit(IBS_STOPPING, pcpu->state); +		perf_ibs_disable_event(perf_ibs, hwc, config); +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; +	} + +	if (hwc->state & PERF_HES_UPTODATE) +		return; + +	/* +	 * Clear valid bit to not count rollovers on update, rollovers +	 * are only updated in the irq handler. 
+	 */ +	config &= ~perf_ibs->valid_mask; + +	perf_ibs_event_update(perf_ibs, event, &config); +	hwc->state |= PERF_HES_UPTODATE; +} +  static int perf_ibs_add(struct perf_event *event, int flags)  { +	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + +	if (test_and_set_bit(IBS_ENABLED, pcpu->state)) +		return -ENOSPC; + +	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + +	pcpu->event = event; + +	if (flags & PERF_EF_START) +		perf_ibs_start(event, PERF_EF_RELOAD); +  	return 0;  }  static void perf_ibs_del(struct perf_event *event, int flags)  { +	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + +	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) +		return; + +	perf_ibs_stop(event, PERF_EF_UPDATE); + +	pcpu->event = NULL; + +	perf_event_update_userpage(event);  } -static struct pmu perf_ibs = { -	.event_init= perf_ibs_init, -	.add= perf_ibs_add, -	.del= perf_ibs_del, +static void perf_ibs_read(struct perf_event *event) { } + +static struct perf_ibs perf_ibs_fetch = { +	.pmu = { +		.task_ctx_nr	= perf_invalid_context, + +		.event_init	= perf_ibs_init, +		.add		= perf_ibs_add, +		.del		= perf_ibs_del, +		.start		= perf_ibs_start, +		.stop		= perf_ibs_stop, +		.read		= perf_ibs_read, +	}, +	.msr			= MSR_AMD64_IBSFETCHCTL, +	.config_mask		= IBS_FETCH_CONFIG_MASK, +	.cnt_mask		= IBS_FETCH_MAX_CNT, +	.enable_mask		= IBS_FETCH_ENABLE, +	.valid_mask		= IBS_FETCH_VAL, +	.max_period		= IBS_FETCH_MAX_CNT << 4, +	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK }, +	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT, + +	.get_count		= get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { +	.pmu = { +		.task_ctx_nr	= perf_invalid_context, + +		.event_init	= perf_ibs_init, +		.add		= perf_ibs_add, +		.del		= perf_ibs_del, +		.start		= perf_ibs_start, +		.stop		= perf_ibs_stop, +		.read		= perf_ibs_read, +	}, +	.msr			= MSR_AMD64_IBSOPCTL, +	.config_mask		= IBS_OP_CONFIG_MASK, +	.cnt_mask		= IBS_OP_MAX_CNT, +	.enable_mask		= IBS_OP_ENABLE, +	.valid_mask		= IBS_OP_VAL, +	.max_period		= IBS_OP_MAX_CNT << 4, +	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK }, +	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT, + +	.get_count		= get_ibs_op_count,  }; +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); +	struct perf_event *event = pcpu->event; +	struct hw_perf_event *hwc = &event->hw; +	struct perf_sample_data data; +	struct perf_raw_record raw; +	struct pt_regs regs; +	struct perf_ibs_data ibs_data; +	int offset, size, check_rip, offset_max, throttle = 0; +	unsigned int msr; +	u64 *buf, *config, period; + +	if (!test_bit(IBS_STARTED, pcpu->state)) { +		/* +		 * Catch spurious interrupts after stopping IBS: After +		 * disabling IBS there could be still incomming NMIs +		 * with samples that even have the valid bit cleared. +		 * Mark all this NMIs as handled. +		 */ +		return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; +	} + +	msr = hwc->config_base; +	buf = ibs_data.regs; +	rdmsrl(msr, *buf); +	if (!(*buf++ & perf_ibs->valid_mask)) +		return 0; + +	config = &ibs_data.regs[0]; +	perf_ibs_event_update(perf_ibs, event, config); +	perf_sample_data_init(&data, 0, hwc->last_period); +	if (!perf_ibs_set_period(perf_ibs, hwc, &period)) +		goto out;	/* no sw counter overflow */ + +	ibs_data.caps = ibs_caps; +	size = 1; +	offset = 1; +	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); +	if (event->attr.sample_type & PERF_SAMPLE_RAW) +		offset_max = perf_ibs->offset_max; +	else if (check_rip) +		offset_max = 2; +	else +		offset_max = 1; +	do { +		rdmsrl(msr + offset, *buf++); +		size++; +		offset = find_next_bit(perf_ibs->offset_mask, +				       perf_ibs->offset_max, +				       offset + 1); +	} while (offset < offset_max); +	ibs_data.size = sizeof(u64) * size; + +	regs = *iregs; +	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { +		regs.flags &= ~PERF_EFLAGS_EXACT; +	} else { +		instruction_pointer_set(®s, ibs_data.regs[1]); +		regs.flags |= PERF_EFLAGS_EXACT; +	} + +	if (event->attr.sample_type & PERF_SAMPLE_RAW) { +		raw.size = sizeof(u32) + ibs_data.size; +		raw.data = ibs_data.data; +		data.raw = &raw; +	} + +	throttle = perf_event_overflow(event, &data, ®s); +out: +	if (throttle) +		perf_ibs_disable_event(perf_ibs, hwc, *config); +	else +		perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + +	perf_event_update_userpage(event); + +	return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ +	int handled = 0; + +	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); +	handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + +	if (handled) +		inc_irq_stat(apic_perf_irqs); + +	return handled; +} + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ +	struct cpu_perf_ibs __percpu *pcpu; +	int ret; + +	pcpu = alloc_percpu(struct cpu_perf_ibs); +	if (!pcpu) +		return -ENOMEM; + +	perf_ibs->pcpu = pcpu; + +	ret = perf_pmu_register(&perf_ibs->pmu, name, -1); +	if (ret) { +		perf_ibs->pcpu = NULL; +		free_percpu(pcpu); +	} + +	return ret; +} +  static __init int perf_event_ibs_init(void)  {  	if (!ibs_caps)  		return -ENODEV;	/* ibs not supported by the cpu */ -	perf_pmu_register(&perf_ibs, "ibs", -1); +	perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +	if (ibs_caps & IBS_CAPS_OPCNT) +		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; +	perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +	register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");  	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);  	return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2fef10..166546ec6ae 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	u64 status;  	int handled; -	perf_sample_data_init(&data, 0); -  	cpuc = &__get_cpu_var(cpu_hw_events);  	/* @@ -1082,7 +1080,7 @@ again:  		if (!intel_pmu_save_and_restart(event))  			continue; -		data.period = event->hw.last_period; +		perf_sample_data_init(&data, 0, event->hw.last_period);  		if (has_branch_stack(event))  			data.br_stack = &cpuc->lbr_stack; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7d..5a3edc27f6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ 
-316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)  	ds->bts_index = ds->bts_buffer_base; -	perf_sample_data_init(&data, 0); -	data.period = event->hw.last_period; +	perf_sample_data_init(&data, 0, event->hw.last_period);  	regs.ip     = 0;  	/* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  	if (!intel_pmu_save_and_restart(event))  		return; -	perf_sample_data_init(&data, 0); -	data.period = event->hw.last_period; +	perf_sample_data_init(&data, 0, event->hw.last_period);  	/*  	 * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacfd710..47124a73dd7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)  	int idx, handled = 0;  	u64 val; -	perf_sample_data_init(&data, 0); -  	cpuc = &__get_cpu_var(cpu_hw_events);  	for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)  		handled += overflow;  		/* event overflow for sure */ -		data.period = event->hw.last_period; +		perf_sample_data_init(&data, 0, hwc->last_period);  		if (!x86_perf_event_set_period(event))  			continue; + +  		if (perf_event_overflow(event, &data, regs))  			x86_pmu_stop(event, 0);  	} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1b81839b6c8..571246d81ed 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)  			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)  		return 1; -	show_registers(regs); +	show_regs(regs);  #ifdef CONFIG_X86_32  	if (user_mode_vm(regs)) {  		sp = regs->sp; @@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)  static int __init kstack_setup(char *s)  { +	ssize_t ret; +	unsigned long val; +  	if (!s)  		return -EINVAL; -	kstack_depth_to_print = simple_strtoul(s, NULL, 0); + +	ret = kstrtoul(s, 0, &val); +	if (ret) +		return ret; +	kstack_depth_to_print = val;  	return 0;  }  early_param("kstack", kstack_setup);  static int __init code_bytes_setup(char *s)  { -	code_bytes = simple_strtoul(s, NULL, 0); +	ssize_t ret; +	unsigned long val; + +	if (!s) +		return -EINVAL; + +	ret = kstrtoul(s, 0, &val); +	if (ret) +		return ret; + +	code_bytes = val;  	if (code_bytes > 8192)  		code_bytes = 8192; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 88ec9129271..e0b1d783daa 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs)  {  	int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 17107bd6e1f..791b76122aa 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  	show_trace_log_lvl(task, regs, sp, bp, log_lvl);  } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs)  {  	int i;  	unsigned long sp; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7b784f4ef1e..01ccf9b7147 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -56,6 +56,7 @@  #include <asm/irq_vectors.h>  
#include <asm/cpufeature.h>  #include <asm/alternative-asm.h> +#include <asm/asm.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */  #include <linux/elf-em.h> @@ -151,10 +152,8 @@  .pushsection .fixup, "ax"  99:	movl $0, (%esp)  	jmp 98b -.section __ex_table, "a" -	.align 4 -	.long 98b, 99b  .popsection +	_ASM_EXTABLE(98b,99b)  .endm  .macro PTGS_TO_GS @@ -164,10 +163,8 @@  .pushsection .fixup, "ax"  99:	movl $0, PT_GS(%esp)  	jmp 98b -.section __ex_table, "a" -	.align 4 -	.long 98b, 99b  .popsection +	_ASM_EXTABLE(98b,99b)  .endm  .macro GS_TO_REG reg @@ -249,12 +246,10 @@  	jmp 2b  6:	movl $0, (%esp)  	jmp 3b -.section __ex_table, "a" -	.align 4 -	.long 1b, 4b -	.long 2b, 5b -	.long 3b, 6b  .popsection +	_ASM_EXTABLE(1b,4b) +	_ASM_EXTABLE(2b,5b) +	_ASM_EXTABLE(3b,6b)  	POP_GS_EX  .endm @@ -415,10 +410,7 @@ sysenter_past_esp:  	jae syscall_fault  1:	movl (%ebp),%ebp  	movl %ebp,PT_EBP(%esp) -.section __ex_table,"a" -	.align 4 -	.long 1b,syscall_fault -.previous +	_ASM_EXTABLE(1b,syscall_fault)  	GET_THREAD_INFO(%ebp) @@ -485,10 +477,8 @@ sysexit_audit:  .pushsection .fixup,"ax"  2:	movl $0,PT_FS(%esp)  	jmp 1b -.section __ex_table,"a" -	.align 4 -	.long 1b,2b  .popsection +	_ASM_EXTABLE(1b,2b)  	PTGS_TO_GS_EX  ENDPROC(ia32_sysenter_target) @@ -543,10 +533,7 @@ ENTRY(iret_exc)  	pushl $do_iret_error  	jmp error_code  .previous -.section __ex_table,"a" -	.align 4 -	.long irq_return,iret_exc -.previous +	_ASM_EXTABLE(irq_return,iret_exc)  	CFI_RESTORE_STATE  ldt_ss: @@ -901,10 +888,7 @@ END(device_not_available)  #ifdef CONFIG_PARAVIRT  ENTRY(native_iret)  	iret -.section __ex_table,"a" -	.align 4 -	.long native_iret, iret_exc -.previous +	_ASM_EXTABLE(native_iret, iret_exc)  END(native_iret)  ENTRY(native_irq_enable_sysexit) @@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback)  	movl %eax,16(%esp)  	jmp 4b  .previous -.section __ex_table,"a" -	.align 4 -	.long 1b,6b -	.long 2b,7b -	.long 3b,8b -	.long 4b,9b -.previous +	_ASM_EXTABLE(1b,6b) +	_ASM_EXTABLE(2b,7b) +	_ASM_EXTABLE(3b,8b) +	_ASM_EXTABLE(4b,9b)  ENDPROC(xen_failsafe_callback)  BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cdc79b5cfcd..320852d0202 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@  #include <asm/paravirt.h>  #include <asm/ftrace.h>  #include <asm/percpu.h> +#include <asm/asm.h>  #include <linux/err.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  
*/ @@ -900,18 +901,12 @@ restore_args:  irq_return:  	INTERRUPT_RETURN - -	.section __ex_table, "a" -	.quad irq_return, bad_iret -	.previous +	_ASM_EXTABLE(irq_return, bad_iret)  #ifdef CONFIG_PARAVIRT  ENTRY(native_iret)  	iretq - -	.section __ex_table,"a" -	.quad native_iret, bad_iret -	.previous +	_ASM_EXTABLE(native_iret, bad_iret)  #endif  	.section .fixup,"ax" @@ -1181,10 +1176,7 @@ gs_change:  	CFI_ENDPROC  END(native_load_gs_index) -	.section __ex_table,"a" -	.align 8 -	.quad gs_change,bad_gs -	.previous +	_ASM_EXTABLE(gs_change,bad_gs)  	.section .fixup,"ax"  	/* running with kernelgs */  bad_gs: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272f..32ff36596ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,40 +24,21 @@  #include <trace/syscall.h>  #include <asm/cacheflush.h> +#include <asm/kprobes.h>  #include <asm/ftrace.h>  #include <asm/nops.h> -#include <asm/nmi.h> -  #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); -  int ftrace_arch_code_modify_prepare(void)  {  	set_kernel_text_rw();  	set_all_modules_text_rw(); -	modifying_code = 1;  	return 0;  }  int ftrace_arch_code_modify_post_process(void)  { -	modifying_code = 0;  	set_all_modules_text_ro();  	set_kernel_text_ro();  	return 0; @@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  	return calc.code;  } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - *    and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - *    we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. 
- */ - -#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status;		/* holds return value of text write */ -static void *mod_code_ip;		/* holds the IP to write to */ -static const void *mod_code_newcode;	/* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ -	int r; - -	r = snprintf(buf, size, "%u %u", -		     nmi_wait_count, -		     atomic_read(&nmi_update_count)); -	return r; -} - -static void clear_mod_flag(void) -{ -	int old = atomic_read(&nmi_running); - -	for (;;) { -		int new = old & ~MOD_CODE_WRITE_FLAG; - -		if (old == new) -			break; - -		old = atomic_cmpxchg(&nmi_running, old, new); -	} -} - -static void ftrace_mod_code(void) -{ -	/* -	 * Yes, more than one CPU process can be writing to mod_code_status. -	 *    (and the code itself) -	 * But if one were to fail, then they all should, and if one were -	 * to succeed, then they all should. -	 */ -	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, -					     MCOUNT_INSN_SIZE); - -	/* if we fail, then kill any new writers */ -	if (mod_code_status) -		clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ -	__this_cpu_write(save_modifying_code, modifying_code); - -	if (!__this_cpu_read(save_modifying_code)) -		return; - -	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { -		smp_rmb(); -		ftrace_mod_code(); -		atomic_inc(&nmi_update_count); -	} -	/* Must have previous changes seen before executions */ -	smp_mb(); -} - -void ftrace_nmi_exit(void) -{ -	if (!__this_cpu_read(save_modifying_code)) -		return; - -	/* Finish all executions before clearing nmi_running */ -	smp_mb(); -	atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ -	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) -		return; - -	do { -		cpu_relax(); -	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - -	nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ -	if (!atomic_read(&nmi_running)) -		return; - -	do { -		cpu_relax(); -	} while (atomic_read(&nmi_running)); - -	nmi_wait_count++; -} -  static inline int  within(unsigned long addr, unsigned long start, unsigned long end)  { @@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)  	if (within(ip, (unsigned long)_text, (unsigned long)_etext))  		ip = (unsigned long)__va(__pa(ip)); -	mod_code_ip = (void *)ip; -	mod_code_newcode = new_code; - -	/* The buffers need to be visible before we let NMIs write them */ -	smp_mb(); - -	wait_for_nmi_and_set_mod_flag(); - -	/* Make sure all running NMIs have finished before we write the code */ -	smp_mb(); - -	ftrace_mod_code(); - -	/* Make sure the write happens before clearing the bit */ -	smp_mb(); - -	clear_mod_flag(); -	wait_for_nmi(); - -	return mod_code_status; +	return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);  }  static const unsigned char *ftrace_nop_replace(void) @@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func)  	return ret;  } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. 
+ */ +int ftrace_int3_handler(struct pt_regs *regs) +{ +	if (WARN_ON_ONCE(!regs)) +		return 0; + +	if (!ftrace_location(regs->ip - 1)) +		return 0; + +	regs->ip += MCOUNT_INSN_SIZE - 1; + +	return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ +	/* +	 * On x86_64, kernel text mappings are mapped read-only with +	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead +	 * of the kernel text mapping to modify the kernel text. +	 * +	 * For 32bit kernels, these mappings are same and we can use +	 * kernel identity mapping to modify code. +	 */ +	if (within(ip, (unsigned long)_text, (unsigned long)_etext)) +		ip = (unsigned long)__va(__pa(ip)); + +	return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ +	unsigned char replaced[MCOUNT_INSN_SIZE]; +	unsigned char brk = BREAKPOINT_INSTRUCTION; + +	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* Make sure it is what we expect it to be */ +	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) +		return -EINVAL; + +	if (ftrace_write(ip, &brk, 1)) +		return -EPERM; + +	return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned const char *old; +	unsigned long ip = rec->ip; + +	old = ftrace_call_replace(ip, addr); + +	return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ +	unsigned const char *old; + +	old = ftrace_nop_replace(); + +	return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_test_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return add_brk_on_nop(rec); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return add_brk_on_call(rec, ftrace_addr); +	} +	return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ +	unsigned char ins[MCOUNT_INSN_SIZE]; +	unsigned char brk = BREAKPOINT_INSTRUCTION; +	const unsigned char *nop; +	unsigned long ftrace_addr; +	unsigned long ip = rec->ip; + +	/* If we fail the read, just give up */ +	if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	/* If this does not have a breakpoint, we are done */ +	if (ins[0] != brk) +		return -1; + +	nop = ftrace_nop_replace(); + +	/* +	 * If the last 4 bytes of the instruction do not match +	 * a nop, then we assume that this is a call to ftrace_addr. +	 */ +	if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { +		/* +		 * For extra paranoidism, we check if the breakpoint is on +		 * a call that would actually jump to the ftrace_addr. +		 * If not, don't touch the breakpoint, we make just create +		 * a disaster. 
+		 */ +		ftrace_addr = (unsigned long)FTRACE_ADDR; +		nop = ftrace_call_replace(ip, ftrace_addr); + +		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) +			return -EINVAL; +	} + +	return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ +	/* skip breakpoint */ +	ip++; +	new++; +	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) +		return -EPERM; +	return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_call_replace(ip, addr); +	return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_nop_replace(); +	return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_test_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return add_update_call(rec, ftrace_addr); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return add_update_nop(rec); +	} + +	return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_call_replace(ip, addr); + +	if (ftrace_write(ip, new, 1)) +		return -EPERM; + +	return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ +	unsigned long ip = rec->ip; +	unsigned const char *new; + +	new = ftrace_nop_replace(); + +	if (ftrace_write(ip, new, 1)) +		return -EPERM; +	return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ +	unsigned long ftrace_addr; +	int ret; + +	ret = ftrace_update_record(rec, enable); + +	ftrace_addr = (unsigned long)FTRACE_ADDR; + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE: +		return 0; + +	case FTRACE_UPDATE_MAKE_CALL: +		/* converting nop to call */ +		return finish_update_call(rec, ftrace_addr); + +	case FTRACE_UPDATE_MAKE_NOP: +		/* converting a call to a nop */ +		return finish_update_nop(rec); +	} + +	return 0; +} + +static void do_sync_core(void *data) +{ +	sync_core(); +} + +static void run_sync(void) +{ +	int enable_irqs = irqs_disabled(); + +	/* We may be called with interrupts disbled (on bootup). */ +	if (enable_irqs) +		local_irq_enable(); +	on_each_cpu(do_sync_core, NULL, 1); +	if (enable_irqs) +		local_irq_disable(); +} + +void ftrace_replace_code(int enable) +{ +	struct ftrace_rec_iter *iter; +	struct dyn_ftrace *rec; +	const char *report = "adding breakpoints"; +	int count = 0; +	int ret; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = add_breakpoints(rec, enable); +		if (ret) +			goto remove_breakpoints; +		count++; +	} + +	run_sync(); + +	report = "updating code"; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = add_update(rec, enable); +		if (ret) +			goto remove_breakpoints; +	} + +	run_sync(); + +	report = "removing breakpoints"; + +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); + +		ret = finish_update(rec, enable); +		if (ret) +			goto remove_breakpoints; +	} + +	run_sync(); + +	return; + + remove_breakpoints: +	ftrace_bug(ret, rec ? 
rec->ip : 0); +	printk(KERN_WARNING "Failed on %s (%d):\n", report, count); +	for_ftrace_rec_iter(iter) { +		rec = ftrace_rec_iter_record(iter); +		remove_breakpoint(rec); +	} +} + +void arch_ftrace_update_code(int command) +{ +	modifying_ftrace_code++; + +	ftrace_modify_all_code(command); + +	modifying_ftrace_code--; +} +  int __init ftrace_dyn_arch_init(void *data)  {  	/* The return code is retured via data */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085..463c9797ca6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -21,6 +21,7 @@  #include <asm/msr-index.h>  #include <asm/cpufeature.h>  #include <asm/percpu.h> +#include <asm/nops.h>  /* Physical address */  #define pa(X) ((X) - __PAGE_OFFSET) @@ -363,28 +364,23 @@ default_entry:  	pushl $0  	popfl -#ifdef CONFIG_SMP -	cmpb $0, ready -	jnz checkCPUtype -#endif /* CONFIG_SMP */ -  /*   * start system 32-bit setup. We need to re-do some of the things done   * in 16-bit mode for the "real" operations.   */ -	call setup_idt - -checkCPUtype: - -	movl $-1,X86_CPUID		#  -1 for no CPUID initially - +	movl setup_once_ref,%eax +	andl %eax,%eax +	jz 1f				# Did we do this already? +	call *%eax +1: +	  /* check if it is 486 or 386. */  /*   * XXX - this does a lot of unnecessary setup.  Alignment checks don't   * apply at our cpl of 0 and the stack ought to be aligned already, and   * we don't need to preserve eflags.   */ - +	movl $-1,X86_CPUID	# -1 for no CPUID initially  	movb $3,X86		# at least 386  	pushfl			# push EFLAGS  	popl %eax		# get EFLAGS @@ -450,21 +446,6 @@ is386:	movl $2,%ecx		# set MP  	movl $(__KERNEL_PERCPU), %eax  	movl %eax,%fs			# set this cpu's percpu -#ifdef CONFIG_CC_STACKPROTECTOR -	/* -	 * The linker can't handle this by relocation.  Manually set -	 * base address in stack canary segment descriptor. -	 */ -	cmpb $0,ready -	jne 1f -	movl $gdt_page,%eax -	movl $stack_canary,%ecx -	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) -	shrl $16, %ecx -	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) -	movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -1: -#endif  	movl $(__KERNEL_STACK_CANARY),%eax  	movl %eax,%gs @@ -473,7 +454,6 @@ is386:	movl $2,%ecx		# set MP  	cld			# gcc2 wants the direction flag cleared at all times  	pushl $0		# fake return address for unwinder -	movb $1, ready  	jmp *(initial_code)  /* @@ -495,81 +475,122 @@ check_x87:  	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */  	ret +	 +#include "verify_cpu.S" +  /* - *  setup_idt + *  setup_once   * - *  sets up a idt with 256 entries pointing to - *  ignore_int, interrupt gates. It doesn't actually load - *  idt - that can be done only after paging has been enabled - *  and the kernel moved to PAGE_OFFSET. Interrupts - *  are enabled elsewhere, when we can be relatively - *  sure everything is ok. + *  The setup work we only want to run on the BSP.   *   *  Warning: %esi is live across this function.   */ -setup_idt: -	lea ignore_int,%edx -	movl $(__KERNEL_CS << 16),%eax -	movw %dx,%ax		/* selector = 0x0010 = cs */ -	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */ +__INIT +setup_once: +	/* +	 * Set up a idt with 256 entries pointing to ignore_int, +	 * interrupt gates. It doesn't actually load idt - that needs +	 * to be done on each CPU. Interrupts are enabled elsewhere, +	 * when we can be relatively sure everything is ok. 
+	 */ -	lea idt_table,%edi -	mov $256,%ecx -rp_sidt: +	movl $idt_table,%edi +	movl $early_idt_handlers,%eax +	movl $NUM_EXCEPTION_VECTORS,%ecx +1:  	movl %eax,(%edi) -	movl %edx,4(%edi) +	movl %eax,4(%edi) +	/* interrupt gate, dpl=0, present */ +	movl $(0x8E000000 + __KERNEL_CS),2(%edi) +	addl $9,%eax  	addl $8,%edi -	dec %ecx -	jne rp_sidt +	loop 1b -.macro	set_early_handler handler,trapno -	lea \handler,%edx +	movl $256 - NUM_EXCEPTION_VECTORS,%ecx +	movl $ignore_int,%edx  	movl $(__KERNEL_CS << 16),%eax -	movw %dx,%ax +	movw %dx,%ax		/* selector = 0x0010 = cs */  	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */ -	lea idt_table,%edi -	movl %eax,8*\trapno(%edi) -	movl %edx,8*\trapno+4(%edi) -.endm +2: +	movl %eax,(%edi) +	movl %edx,4(%edi) +	addl $8,%edi +	loop 2b -	set_early_handler handler=early_divide_err,trapno=0 -	set_early_handler handler=early_illegal_opcode,trapno=6 -	set_early_handler handler=early_protection_fault,trapno=13 -	set_early_handler handler=early_page_fault,trapno=14 +#ifdef CONFIG_CC_STACKPROTECTOR +	/* +	 * Configure the stack canary. The linker can't handle this by +	 * relocation.  Manually set base address in stack canary +	 * segment descriptor. +	 */ +	movl $gdt_page,%eax +	movl $stack_canary,%ecx +	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) +	shrl $16, %ecx +	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) +	movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif +	andl $0,setup_once_ref	/* Once is enough, thanks */  	ret -early_divide_err: -	xor %edx,%edx -	pushl $0	/* fake errcode */ -	jmp early_fault +ENTRY(early_idt_handlers) +	# 36(%esp) %eflags +	# 32(%esp) %cs +	# 28(%esp) %eip +	# 24(%rsp) error code +	i = 0 +	.rept NUM_EXCEPTION_VECTORS +	.if (EXCEPTION_ERRCODE_MASK >> i) & 1 +	ASM_NOP2 +	.else +	pushl $0		# Dummy error code, to make stack frame uniform +	.endif +	pushl $i		# 20(%esp) Vector number +	jmp early_idt_handler +	i = i + 1 +	.endr +ENDPROC(early_idt_handlers) +	 +	/* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) +	cld +	cmpl $2,%ss:early_recursion_flag +	je hlt_loop +	incl %ss:early_recursion_flag -early_illegal_opcode: -	movl $6,%edx -	pushl $0	/* fake errcode */ -	jmp early_fault +	push %eax		# 16(%esp) +	push %ecx		# 12(%esp) +	push %edx		#  8(%esp) +	push %ds		#  4(%esp) +	push %es		#  0(%esp) +	movl $(__KERNEL_DS),%eax +	movl %eax,%ds +	movl %eax,%es -early_protection_fault: -	movl $13,%edx -	jmp early_fault +	cmpl $(__KERNEL_CS),32(%esp) +	jne 10f -early_page_fault: -	movl $14,%edx -	jmp early_fault +	leal 28(%esp),%eax	# Pointer to %eip +	call early_fixup_exception +	andl %eax,%eax +	jnz ex_entry		/* found an exception entry */ -early_fault: -	cld +10:  #ifdef CONFIG_PRINTK -	pusha -	movl $(__KERNEL_DS),%eax -	movl %eax,%ds -	movl %eax,%es -	cmpl $2,early_recursion_flag -	je hlt_loop -	incl early_recursion_flag +	xorl %eax,%eax +	movw %ax,2(%esp)	/* clean up the segment values on some cpus */ +	movw %ax,6(%esp) +	movw %ax,34(%esp) +	leal  40(%esp),%eax +	pushl %eax		/* %esp before the exception */ +	pushl %ebx +	pushl %ebp +	pushl %esi +	pushl %edi  	movl %cr2,%eax  	pushl %eax -	pushl %edx		/* trapno */ +	pushl (20+6*4)(%esp)	/* trapno */  	pushl $fault_msg  	call printk  #endif @@ -578,6 +599,17 @@ hlt_loop:  	hlt  	jmp hlt_loop +ex_entry: +	pop %es +	pop %ds +	pop %edx +	pop %ecx +	pop %eax +	addl $8,%esp		/* drop vector number and error code */ +	decl %ss:early_recursion_flag +	iret +ENDPROC(early_idt_handler) +  /* This is the default interrupt "handler" :-) */  	ALIGN  ignore_int: @@ 
-611,13 +643,18 @@ ignore_int:  	popl %eax  #endif  	iret +ENDPROC(ignore_int) +__INITDATA +	.align 4 +early_recursion_flag: +	.long 0 -#include "verify_cpu.S" - -	__REFDATA -.align 4 +__REFDATA +	.align 4  ENTRY(initial_code)  	.long i386_start_kernel +ENTRY(setup_once_ref) +	.long setup_once  /*   * BSS section @@ -670,22 +707,19 @@ ENTRY(initial_page_table)  ENTRY(stack_start)  	.long init_thread_union+THREAD_SIZE -early_recursion_flag: -	.long 0 - -ready:	.byte 0 - +__INITRODATA  int_msg:  	.asciz "Unknown interrupt or fault at: %p %p %p\n"  fault_msg:  /* fault info: */  	.ascii "BUG: Int %d: CR2 %p\n" -/* pusha regs: */ -	.ascii "     EDI %p  ESI %p  EBP %p  ESP %p\n" -	.ascii "     EBX %p  EDX %p  ECX %p  EAX %p\n" +/* regs pushed in early_idt_handler: */ +	.ascii "     EDI %p  ESI %p  EBP %p  EBX %p\n" +	.ascii "     ESP %p   ES %p   DS %p\n" +	.ascii "     EDX %p  ECX %p  EAX %p\n"  /* fault frame: */ -	.ascii "     err %p  EIP %p   CS %p  flg %p\n" +	.ascii "     vec %p  err %p  EIP %p   CS %p  flg %p\n"  	.ascii "Stack: %p %p %p %p %p %p %p %p\n"  	.ascii "       %p %p %p %p %p %p %p %p\n"  	.asciz "       %p %p %p %p %p %p %p %p\n" @@ -699,6 +733,7 @@ fault_msg:   * segment size, and 32-bit linear address value:   */ +	.data  .globl boot_gdt_descr  .globl idt_descr diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d..7a40f244732 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,12 +19,15 @@  #include <asm/cache.h>  #include <asm/processor-flags.h>  #include <asm/percpu.h> +#include <asm/nops.h>  #ifdef CONFIG_PARAVIRT  #include <asm/asm-offsets.h>  #include <asm/paravirt.h> +#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg  #else -#define GET_CR2_INTO_RCX movq %cr2, %rcx +#define GET_CR2_INTO(reg) movq %cr2, reg +#define INTERRUPT_RETURN iretq  #endif  /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -270,36 +273,56 @@ bad_address:  	jmp bad_address  	.section ".init.text","ax" -#ifdef CONFIG_EARLY_PRINTK  	.globl early_idt_handlers  early_idt_handlers: +	# 104(%rsp) %rflags +	#  96(%rsp) %cs +	#  88(%rsp) %rip +	#  80(%rsp) error code  	i = 0  	.rept NUM_EXCEPTION_VECTORS -	movl $i, %esi +	.if (EXCEPTION_ERRCODE_MASK >> i) & 1 +	ASM_NOP2 +	.else +	pushq $0		# Dummy error code, to make stack frame uniform +	.endif +	pushq $i		# 72(%rsp) Vector number  	jmp early_idt_handler  	i = i + 1  	.endr -#endif  ENTRY(early_idt_handler) -#ifdef CONFIG_EARLY_PRINTK +	cld +  	cmpl $2,early_recursion_flag(%rip)  	jz  1f  	incl early_recursion_flag(%rip) -	GET_CR2_INTO_RCX -	movq %rcx,%r9 -	xorl %r8d,%r8d		# zero for error code -	movl %esi,%ecx		# get vector number -	# Test %ecx against mask of vectors that push error code. 
-	cmpl $31,%ecx -	ja 0f -	movl $1,%eax -	salq %cl,%rax -	testl $0x27d00,%eax -	je 0f -	popq %r8		# get error code -0:	movq 0(%rsp),%rcx	# get ip -	movq 8(%rsp),%rdx	# get cs + +	pushq %rax		# 64(%rsp) +	pushq %rcx		# 56(%rsp) +	pushq %rdx		# 48(%rsp) +	pushq %rsi		# 40(%rsp) +	pushq %rdi		# 32(%rsp) +	pushq %r8		# 24(%rsp) +	pushq %r9		# 16(%rsp) +	pushq %r10		#  8(%rsp) +	pushq %r11		#  0(%rsp) + +	cmpl $__KERNEL_CS,96(%rsp) +	jne 10f + +	leaq 88(%rsp),%rdi	# Pointer to %rip +	call early_fixup_exception +	andl %eax,%eax +	jnz 20f			# Found an exception entry + +10: +#ifdef CONFIG_EARLY_PRINTK +	GET_CR2_INTO(%r9)	# can clobber any volatile register if pv +	movl 80(%rsp),%r8d	# error code +	movl 72(%rsp),%esi	# vector number +	movl 96(%rsp),%edx	# %cs +	movq 88(%rsp),%rcx	# %rip  	xorl %eax,%eax  	leaq early_idt_msg(%rip),%rdi  	call early_printk @@ -308,17 +331,32 @@ ENTRY(early_idt_handler)  	call dump_stack  #ifdef CONFIG_KALLSYMS	  	leaq early_idt_ripmsg(%rip),%rdi -	movq 0(%rsp),%rsi	# get rip again +	movq 40(%rsp),%rsi	# %rip again  	call __print_symbol  #endif  #endif /* EARLY_PRINTK */  1:	hlt  	jmp 1b -#ifdef CONFIG_EARLY_PRINTK +20:	# Exception table entry found +	popq %r11 +	popq %r10 +	popq %r9 +	popq %r8 +	popq %rdi +	popq %rsi +	popq %rdx +	popq %rcx +	popq %rax +	addq $16,%rsp		# drop vector number and error code +	decl early_recursion_flag(%rip) +	INTERRUPT_RETURN + +	.balign 4  early_recursion_flag:  	.long 0 +#ifdef CONFIG_EARLY_PRINTK  early_idt_msg:  	.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"  early_idt_ripmsg: diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2d6e6498c17..f250431fb50 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -88,7 +88,7 @@ void kernel_fpu_begin(void)  		__thread_clear_has_fpu(me);  		/* We do 'stts()' in kernel_fpu_end() */  	} else { -		percpu_write(fpu_owner_task, NULL); +		this_cpu_write(fpu_owner_task, NULL);  		clts();  	}  } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e213fc8408d..e2f751efb7b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  			       "current sp %p does not match saved sp %p\n",  			       stack_addr(regs), kcb->jprobe_saved_sp);  			printk(KERN_ERR "Saved registers for jprobe %p\n", jp); -			show_registers(saved_regs); +			show_regs(saved_regs);  			printk(KERN_ERR "Current registers\n"); -			show_registers(regs); +			show_regs(regs);  			BUG();  		}  		*regs = kcb->jprobe_saved_regs; diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c deleted file mode 100644 index 7eb1e2b9782..00000000000 --- a/arch/x86/kernel/mca_32.c +++ /dev/null @@ -1,476 +0,0 @@ -/* - *  Written by Martin Kolinek, February 1996 - * - * Changes: - * - *	Chris Beauregard July 28th, 1996 - *	- Fixed up integrated SCSI detection - * - *	Chris Beauregard August 3rd, 1996 - *	- Made mca_info local - *	- Made integrated registers accessible through standard function calls - *	- Added name field - *	- More sanity checking - * - *	Chris Beauregard August 9th, 1996 - *	- Rewrote /proc/mca - * - *	Chris Beauregard January 7th, 1997 - *	- Added basic NMI-processing - *	- Added more information to mca_info structure - * - *	David Weinehall October 12th, 1998 - *	- Made a lot of cleaning up in the source - *	- Added use of save_flags / restore_flags - *	- Added the 'driver_loaded' flag in MCA_adapter - *	- Added an alternative implemention 
of ZP Gu's mca_find_unused_adapter - * - *	David Weinehall March 24th, 1999 - *	- Fixed the output of 'Driver Installed' in /proc/mca/pos - *	- Made the Integrated Video & SCSI show up even if they have id 0000 - * - *	Alexander Viro November 9th, 1999 - *	- Switched to regular procfs methods - * - *	Alfred Arnold & David Weinehall August 23rd, 2000 - *	- Added support for Planar POS-registers - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/mca.h> -#include <linux/kprobes.h> -#include <linux/slab.h> -#include <asm/io.h> -#include <linux/proc_fs.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/ioport.h> -#include <asm/uaccess.h> -#include <linux/init.h> - -static unsigned char which_scsi; - -int MCA_bus; -EXPORT_SYMBOL(MCA_bus); - -/* - * Motherboard register spinlock. Untested on SMP at the moment, but - * are there any MCA SMP boxes? - * - * Yes - Alan - */ -static DEFINE_SPINLOCK(mca_lock); - -/* Build the status info for the adapter */ - -static void mca_configure_adapter_status(struct mca_device *mca_dev) -{ -	mca_dev->status = MCA_ADAPTER_NONE; - -	mca_dev->pos_id = mca_dev->pos[0] -		+ (mca_dev->pos[1] << 8); - -	if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { - -		/* -		 * id = 0x0000 usually indicates hardware failure, -		 * however, ZP Gu (zpg@castle.net> reports that his 9556 -		 * has 0x0000 as id and everything still works. There -		 * also seem to be an adapter with id = 0x0000; the -		 * NCR Parallel Bus Memory Card. Until this is confirmed, -		 * however, this code will stay. -		 */ - -		mca_dev->status = MCA_ADAPTER_ERROR; - -		return; -	} else if (mca_dev->pos_id != 0xffff) { - -		/* -		 * 0xffff usually indicates that there's no adapter, -		 * however, some integrated adapters may have 0xffff as -		 * their id and still be valid. Examples are on-board -		 * VGA of the 55sx, the integrated SCSI of the 56 & 57, -		 * and possibly also the 95 ULTIMEDIA. -		 */ - -		mca_dev->status = MCA_ADAPTER_NORMAL; -	} - -	if ((mca_dev->pos_id == 0xffff || -	    mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { -		int j; - -		for (j = 2; j < 8; j++) { -			if (mca_dev->pos[j] != 0xff) { -				mca_dev->status = MCA_ADAPTER_NORMAL; -				break; -			} -		} -	} - -	if (!(mca_dev->pos[2] & MCA_ENABLED)) { - -		/* enabled bit is in POS 2 */ - -		mca_dev->status = MCA_ADAPTER_DISABLED; -	} -} /* mca_configure_adapter_status */ - -/*--------------------------------------------------------------------*/ - -static struct resource mca_standard_resources[] = { -	{ .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, -	{ .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, -	{ .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, -	{ .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, -	{ .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, -	{ .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, -	{ .start = 0x100, .end = 0x107, .name = "POS (MCA)" } -}; - -#define MCA_STANDARD_RESOURCES	ARRAY_SIZE(mca_standard_resources) - -/* - *	mca_read_and_store_pos - read the POS registers into a memory buffer - *      @pos: a char pointer to 8 bytes, contains the POS register value on - *            successful return - * - *	Returns 1 if a card actually exists (i.e. 
the pos isn't - *	all 0xff) or 0 otherwise - */ -static int mca_read_and_store_pos(unsigned char *pos) -{ -	int j; -	int found = 0; - -	for (j = 0; j < 8; j++) { -		pos[j] = inb_p(MCA_POS_REG(j)); -		if (pos[j] != 0xff) { -			/* 0xff all across means no device. 0x00 means -			 * something's broken, but a device is -			 * probably there.  However, if you get 0x00 -			 * from a motherboard register it won't matter -			 * what we find.  For the record, on the -			 * 57SLC, the integrated SCSI adapter has -			 * 0xffff for the adapter ID, but nonzero for -			 * other registers.  */ - -			found = 1; -		} -	} -	return found; -} - -static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) -{ -	unsigned char byte; -	unsigned long flags; - -	if (reg < 0 || reg >= 8) -		return 0; - -	spin_lock_irqsave(&mca_lock, flags); -	if (mca_dev->pos_register) { -		/* Disable adapter setup, enable motherboard setup */ - -		outb_p(0, MCA_ADAPTER_SETUP_REG); -		outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - -		byte = inb_p(MCA_POS_REG(reg)); -		outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); -	} else { - -		/* Make sure motherboard setup is off */ - -		outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - -		/* Read the appropriate register */ - -		outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); -		byte = inb_p(MCA_POS_REG(reg)); -		outb_p(0, MCA_ADAPTER_SETUP_REG); -	} -	spin_unlock_irqrestore(&mca_lock, flags); - -	mca_dev->pos[reg] = byte; - -	return byte; -} - -static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, -			     unsigned char byte) -{ -	unsigned long flags; - -	if (reg < 0 || reg >= 8) -		return; - -	spin_lock_irqsave(&mca_lock, flags); - -	/* Make sure motherboard setup is off */ - -	outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - -	/* Read in the appropriate register */ - -	outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); -	outb_p(byte, MCA_POS_REG(reg)); -	outb_p(0, MCA_ADAPTER_SETUP_REG); - -	spin_unlock_irqrestore(&mca_lock, flags); - -	/* Update the global register list, while we have the byte */ - -	mca_dev->pos[reg] = byte; - -} - -/* for the primary MCA bus, we have identity transforms */ -static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq) -{ -	return irq; -} - -static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port) -{ -	return port; -} - -static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem) -{ -	return mem; -} - - -static int __init mca_init(void) -{ -	unsigned int i, j; -	struct mca_device *mca_dev; -	unsigned char pos[8]; -	short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; -	struct mca_bus *bus; - -	/* -	 * WARNING: Be careful when making changes here. Putting an adapter -	 * and the motherboard simultaneously into setup mode may result in -	 * damage to chips (according to The Indispensable PC Hardware Book -	 * by Hans-Peter Messmer). Also, we disable system interrupts (so -	 * that we are not disturbed in the middle of this). 
-	 */ - -	/* Make sure the MCA bus is present */ - -	if (mca_system_init()) { -		printk(KERN_ERR "MCA bus system initialisation failed\n"); -		return -ENODEV; -	} - -	if (!MCA_bus) -		return -ENODEV; - -	printk(KERN_INFO "Micro Channel bus detected.\n"); - -	/* All MCA systems have at least a primary bus */ -	bus = mca_attach_bus(MCA_PRIMARY_BUS); -	if (!bus) -		goto out_nomem; -	bus->default_dma_mask = 0xffffffffLL; -	bus->f.mca_write_pos = mca_pc_write_pos; -	bus->f.mca_read_pos = mca_pc_read_pos; -	bus->f.mca_transform_irq = mca_dummy_transform_irq; -	bus->f.mca_transform_ioport = mca_dummy_transform_ioport; -	bus->f.mca_transform_memory = mca_dummy_transform_memory; - -	/* get the motherboard device */ -	mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); -	if (unlikely(!mca_dev)) -		goto out_nomem; - -	/* -	 * We do not expect many MCA interrupts during initialization, -	 * but let us be safe: -	 */ -	spin_lock_irq(&mca_lock); - -	/* Make sure adapter setup is off */ - -	outb_p(0, MCA_ADAPTER_SETUP_REG); - -	/* Read motherboard POS registers */ - -	mca_dev->pos_register = 0x7f; -	outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); -	mca_dev->name[0] = 0; -	mca_read_and_store_pos(mca_dev->pos); -	mca_configure_adapter_status(mca_dev); -	/* fake POS and slot for a motherboard */ -	mca_dev->pos_id = MCA_MOTHERBOARD_POS; -	mca_dev->slot = MCA_MOTHERBOARD; -	mca_register_device(MCA_PRIMARY_BUS, mca_dev); - -	mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); -	if (unlikely(!mca_dev)) -		goto out_unlock_nomem; - -	/* Put motherboard into video setup mode, read integrated video -	 * POS registers, and turn motherboard setup off. -	 */ - -	mca_dev->pos_register = 0xdf; -	outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); -	mca_dev->name[0] = 0; -	mca_read_and_store_pos(mca_dev->pos); -	mca_configure_adapter_status(mca_dev); -	/* fake POS and slot for the integrated video */ -	mca_dev->pos_id = MCA_INTEGVIDEO_POS; -	mca_dev->slot = MCA_INTEGVIDEO; -	mca_register_device(MCA_PRIMARY_BUS, mca_dev); - -	/* -	 * Put motherboard into scsi setup mode, read integrated scsi -	 * POS registers, and turn motherboard setup off. -	 * -	 * It seems there are two possible SCSI registers. Martin says that -	 * for the 56,57, 0xf7 is the one, but fails on the 76. -	 * Alfredo (apena@vnet.ibm.com) says -	 * 0xfd works on his machine. We'll try both of them. I figure it's -	 * a good bet that only one could be valid at a time. This could -	 * screw up though if one is used for something else on the other -	 * machine. -	 */ - -	for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { -		outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); -		if (mca_read_and_store_pos(pos)) -			break; -	} -	if (which_scsi) { -		/* found a scsi card */ -		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); -		if (unlikely(!mca_dev)) -			goto out_unlock_nomem; - -		for (j = 0; j < 8; j++) -			mca_dev->pos[j] = pos[j]; - -		mca_configure_adapter_status(mca_dev); -		/* fake POS and slot for integrated SCSI controller */ -		mca_dev->pos_id = MCA_INTEGSCSI_POS; -		mca_dev->slot = MCA_INTEGSCSI; -		mca_dev->pos_register = which_scsi; -		mca_register_device(MCA_PRIMARY_BUS, mca_dev); -	} - -	/* Turn off motherboard setup */ - -	outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - -	/* -	 * Now loop over MCA slots: put each adapter into setup mode, and -	 * read its POS registers. Then put adapter setup off. 
-	 */ - -	for (i = 0; i < MCA_MAX_SLOT_NR; i++) { -		outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); -		if (!mca_read_and_store_pos(pos)) -			continue; - -		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); -		if (unlikely(!mca_dev)) -			goto out_unlock_nomem; - -		for (j = 0; j < 8; j++) -			mca_dev->pos[j] = pos[j]; - -		mca_dev->driver_loaded = 0; -		mca_dev->slot = i; -		mca_dev->pos_register = 0; -		mca_configure_adapter_status(mca_dev); -		mca_register_device(MCA_PRIMARY_BUS, mca_dev); -	} -	outb_p(0, MCA_ADAPTER_SETUP_REG); - -	/* Enable interrupts and return memory start */ -	spin_unlock_irq(&mca_lock); - -	for (i = 0; i < MCA_STANDARD_RESOURCES; i++) -		request_resource(&ioport_resource, mca_standard_resources + i); - -	mca_do_proc_init(); - -	return 0; - - out_unlock_nomem: -	spin_unlock_irq(&mca_lock); - out_nomem: -	printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); -	return -ENOMEM; -} - -subsys_initcall(mca_init); - -/*--------------------------------------------------------------------*/ - -static __kprobes void -mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) -{ -	int slot = mca_dev->slot; - -	if (slot == MCA_INTEGSCSI) { -		printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", -			mca_dev->name); -	} else if (slot == MCA_INTEGVIDEO) { -		printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", -			mca_dev->name); -	} else if (slot == MCA_MOTHERBOARD) { -		printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", -			mca_dev->name); -	} - -	/* More info available in POS 6 and 7? */ - -	if (check_flag) { -		unsigned char pos6, pos7; - -		pos6 = mca_device_read_pos(mca_dev, 6); -		pos7 = mca_device_read_pos(mca_dev, 7); - -		printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); -	} - -} /* mca_handle_nmi_slot */ - -/*--------------------------------------------------------------------*/ - -static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) -{ -	struct mca_device *mca_dev = to_mca_device(dev); -	unsigned char pos5; - -	pos5 = mca_device_read_pos(mca_dev, 5); - -	if (!(pos5 & 0x80)) { -		/* -		 *  Bit 7 of POS 5 is reset when this adapter has a hardware -		 * error. Bit 7 it reset if there's error information -		 * available in POS 6 and 7. -		 */ -		mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); -		return 1; -	} -	return 0; -} - -void __kprobes mca_handle_nmi(void) -{ -	/* -	 *  First try - scan the various adapters and see if a specific -	 * adapter was responsible for the error. 
-	 */ -	bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); -} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index c9bda6d6035..fbdfc691718 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev,  {  	unsigned long val;  	int cpu = dev->id; -	int ret = 0; -	char *end; +	ssize_t ret = 0; -	val = simple_strtoul(buf, &end, 0); -	if (end == buf) -		return -EINVAL; +	ret = kstrtoul(buf, 0, &val); +	if (ret) +		return ret;  	if (val == 1) {  		get_online_cpus(); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ca470e4c92d..b02d4dd6b8a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -97,7 +97,7 @@ static void __init MP_bus_info(struct mpc_bus *m)  	set_bit(m->busid, mp_bus_not_pci);  	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA  		mp_bus_id_to_type[m->busid] = MP_BUS_ISA;  #endif  	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -105,12 +105,10 @@ static void __init MP_bus_info(struct mpc_bus *m)  			x86_init.mpparse.mpc_oem_pci_bus(m);  		clear_bit(m->busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA  		mp_bus_id_to_type[m->busid] = MP_BUS_PCI;  	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {  		mp_bus_id_to_type[m->busid] = MP_BUS_EISA; -	} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { -		mp_bus_id_to_type[m->busid] = MP_BUS_MCA;  #endif  	} else  		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); @@ -368,9 +366,6 @@ static void __init construct_ioapic_table(int mpc_default_type)  	case 3:  		memcpy(bus.bustype, "EISA  ", 6);  		break; -	case 4: -	case 7: -		memcpy(bus.bustype, "MCA   ", 6);  	}  	MP_bus_info(&bus);  	if (mpc_default_type > 4) { @@ -623,7 +618,7 @@ void __init default_find_smp_config(void)  		return;  	/*  	 * If it is an SMP machine we should know now, unless the -	 * configuration is in an EISA/MCA bus machine with an +	 * configuration is in an EISA bus machine with an  	 * extended bios data area.  	 
*  	 * there is a real-mode segmented pointer pointing to the diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 585be4bd71a..90875279ef3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -19,8 +19,6 @@  #include <linux/slab.h>  #include <linux/export.h> -#include <linux/mca.h> -  #if defined(CONFIG_EDAC)  #include <linux/edac.h>  #endif @@ -84,7 +82,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);  #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)  {  	struct nmi_desc *desc = nmi_to_desc(type);  	struct nmiaction *a; @@ -166,7 +164,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)  }  EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void  pci_serr_error(unsigned char reason, struct pt_regs *regs)  {  	/* check to see if anyone registered against these types of errors */ @@ -197,7 +195,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)  	outb(reason, NMI_REASON_PORT);  } -static notrace __kprobes void +static __kprobes void  io_check_error(unsigned char reason, struct pt_regs *regs)  {  	unsigned long i; @@ -209,7 +207,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)  	pr_emerg(  	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",  		 reason, smp_processor_id()); -	show_registers(regs); +	show_regs(regs);  	if (panic_on_io_nmi)  		panic("NMI IOCK error: Not continuing"); @@ -228,7 +226,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)  	outb(reason, NMI_REASON_PORT);  } -static notrace __kprobes void +static __kprobes void  unknown_nmi_error(unsigned char reason, struct pt_regs *regs)  {  	int handled; @@ -247,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)  	__this_cpu_add(nmi_stats.unknown, 1); -#ifdef CONFIG_MCA -	/* -	 * Might actually be able to figure out what the guilty party -	 * is: -	 */ -	if (MCA_bus) { -		mca_handle_nmi(); -		return; -	} -#endif  	pr_emerg("Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n",  		 reason, smp_processor_id()); @@ -270,7 +258,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)  static DEFINE_PER_CPU(bool, swallow_nmi);  static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs)  {  	unsigned char reason = 0;  	int handled; diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510f..e31bf8d5c4d 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -13,6 +13,7 @@  #include <linux/cpumask.h>  #include <linux/delay.h>  #include <linux/init.h> +#include <linux/percpu.h>  #include <asm/apic.h>  #include <asm/nmi.h> @@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected)  		unexpected_testcase_failures++;  		if (nmi_fail == FAILURE) -			printk("FAILED |"); +			printk(KERN_CONT "FAILED |");  		else if (nmi_fail == TIMEOUT) -			printk("TIMEOUT|"); +			printk(KERN_CONT "TIMEOUT|");  		else -			printk("ERROR  |"); +			printk(KERN_CONT "ERROR  |");  		dump_stack();  	} else {  		testcase_successes++; -		printk("  ok  |"); +		printk(KERN_CONT "  ok  |");  	}  	testcase_total++; @@ -150,10 +151,10 @@ void __init nmi_selftest(void)  	print_testname("remote IPI");  	dotest(remote_ipi, SUCCESS); -	printk("\n"); +	printk(KERN_CONT "\n");  	print_testname("local IPI");  	dotest(local_ipi, SUCCESS); -	printk("\n"); +	printk(KERN_CONT "\n");  	cleanup_nmi_testsuite(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ab137605e69..9ce885996fd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA  static inline void enter_lazy(enum paravirt_lazy_mode mode)  { -	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); +	BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); -	percpu_write(paravirt_lazy_mode, mode); +	this_cpu_write(paravirt_lazy_mode, mode);  }  static void leave_lazy(enum paravirt_lazy_mode mode)  { -	BUG_ON(percpu_read(paravirt_lazy_mode) != mode); +	BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); -	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +	this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);  }  void paravirt_enter_lazy_mmu(void) @@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)  {  	BUG_ON(preemptible()); -	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { +	if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {  		arch_leave_lazy_mmu_mode();  		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);  	} @@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)  	if (in_interrupt())  		return PARAVIRT_LAZY_NONE; -	return percpu_read(paravirt_lazy_mode); +	return this_cpu_read(paravirt_lazy_mode);  }  void arch_flush_lazy_mmu_mode(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index d0b2fb9ccbb..b72838bae64 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1480,8 +1480,9 @@ cleanup:  static int __init calgary_parse_options(char *p)  {  	unsigned int bridge; +	unsigned long val;  	size_t len; -	char* endp; +	ssize_t ret;  	while (*p) {  		if (!strncmp(p, "64k", 3)) @@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p)  				++p;  			if (*p 
== '\0')  				break; -			bridge = simple_strtoul(p, &endp, 0); -			if (p == endp) +			ret = kstrtoul(p, 0, &val); +			if (ret)  				break; +			bridge = val;  			if (bridge < MAX_PHB_BUS_NUM) {  				printk(KERN_INFO "Calgary: disabling "  				       "translation for PHB %#x\n", bridge); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e8173154800..735279e54e5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -56,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);  struct kmem_cache *task_xstate_cachep;  EXPORT_SYMBOL_GPL(task_xstate_cachep); +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */  int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)  {  	int ret; +	unlazy_fpu(src); +  	*dst = *src;  	if (fpu_allocated(&src->thread.fpu)) {  		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); @@ -89,6 +95,16 @@ void arch_task_cache_init(void)  				  SLAB_PANIC | SLAB_NOTRACK, NULL);  } +static inline void drop_fpu(struct task_struct *tsk) +{ +	/* +	 * Forget coprocessor state.. +	 */ +	tsk->fpu_counter = 0; +	clear_fpu(tsk); +	clear_used_math(); +} +  /*   * Free current thread data structures etc..   */ @@ -111,12 +127,8 @@ void exit_thread(void)  		put_cpu();  		kfree(bp);  	} -} -void show_regs(struct pt_regs *regs) -{ -	show_registers(regs); -	show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); +	drop_fpu(me);  }  void show_regs_common(void) @@ -151,12 +163,7 @@ void flush_thread(void)  	flush_ptrace_hw_breakpoint(tsk);  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -	/* -	 * Forget coprocessor state.. -	 */ -	tsk->fpu_counter = 0; -	clear_fpu(tsk); -	clear_used_math(); +	drop_fpu(tsk);  }  static void hard_disable_TSC(void) @@ -385,7 +392,7 @@ static inline void play_dead(void)  #ifdef CONFIG_X86_64  void enter_idle(void)  { -	percpu_write(is_idle, 1); +	this_cpu_write(is_idle, 1);  	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);  } @@ -582,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)  {  	u32 eax, ebx, ecx, edx; +	/* Use mwait if idle=mwait boot option is given */  	if (boot_option_idle_override == IDLE_FORCE_MWAIT)  		return 1; +	/* +	 * Any idle= boot option other than idle=mwait means that we must not +	 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait +	 */ +	if (boot_option_idle_override != IDLE_NO_OVERRIDE) +		return 0; +  	if (c->cpuid_level < MWAIT_INFO)  		return 0; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e2..516fa186121 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task)  	release_vm86_irqs(dead_task);  } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. 
- */ -void prepare_to_copy(struct task_struct *tsk) -{ -	unlazy_fpu(tsk); -} -  int copy_thread(unsigned long clone_flags, unsigned long sp,  	unsigned long unused,  	struct task_struct *p, struct pt_regs *regs) @@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	switch_fpu_finish(next_p, fpu); -	percpu_write(current_task, next_p); +	this_cpu_write(current_task, next_p);  	return prev_p;  } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 43d8b48b23e..61cdf7fdf09 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)  	return get_desc_base(&t->thread.tls_array[tls]);  } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ -	unlazy_fpu(tsk); -} -  int copy_thread(unsigned long clone_flags, unsigned long sp,  		unsigned long unused,  	struct task_struct *p, struct pt_regs *regs) @@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,  	current->thread.usersp	= new_sp;  	regs->ip		= new_ip;  	regs->sp		= new_sp; -	percpu_write(old_rsp, new_sp); +	this_cpu_write(old_rsp, new_sp);  	regs->cs		= _cs;  	regs->ss		= _ss;  	regs->flags		= X86_EFLAGS_IF; @@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	/*  	 * Switch the PDA and FPU contexts.  	 */ -	prev->usersp = percpu_read(old_rsp); -	percpu_write(old_rsp, next->usersp); -	percpu_write(current_task, next_p); +	prev->usersp = this_cpu_read(old_rsp); +	this_cpu_write(old_rsp, next->usersp); +	this_cpu_write(current_task, next_p); -	percpu_write(kernel_stack, +	this_cpu_write(kernel_stack,  		  (unsigned long)task_stack_page(next_p) +  		  THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 685845cf16e..13b1990c7c5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs)  		regs->flags |= X86_EFLAGS_TF;  	/* do the secure computing check first */ -	secure_computing(regs->orig_ax); +	if (secure_computing(regs->orig_ax)) { +		/* seccomp failures shouldn't expose any additional code. */ +		ret = -1L; +		goto out; +	}  	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))  		ret = -1L; @@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs)  				    regs->dx, regs->r10);  #endif +out:  	return ret ?: regs->orig_ax;  } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853..77215c23fba 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,7 +39,8 @@ static int reboot_mode;  enum reboot_type reboot_type = BOOT_ACPI;  int reboot_force; -/* This variable is used privately to keep track of whether or not +/* + * This variable is used privately to keep track of whether or not   * reboot_type is still set to its default value (i.e., reboot= hasn't   * been set on the command line).  This is needed so that we can   * suppress DMI scanning for reboot quirks.  Without it, it's @@ -51,7 +52,8 @@ static int reboot_default = 1;  static int reboot_cpu = -1;  #endif -/* This is set if we need to go through the 'emergency' path. +/* + * This is set if we need to go through the 'emergency' path.   
* When machine_emergency_restart() is called, we may be on   * an inconsistent state and won't be able to do a clean cleanup   */ @@ -60,22 +62,24 @@ static int reboot_emergency;  /* This is set by the PCI code if either type 1 or type 2 PCI is detected */  bool port_cf9_safe = false; -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] -   warm   Don't set the cold reboot flag -   cold   Set the cold reboot flag -   bios   Reboot by jumping through the BIOS (only for X86_32) -   smp    Reboot by executing reset on BSP or other CPU (only for X86_32) -   triple Force a triple fault (init) -   kbd    Use the keyboard controller. cold reset (default) -   acpi   Use the RESET_REG in the FADT -   efi    Use efi reset_system runtime service -   pci    Use the so-called "PCI reset register", CF9 -   force  Avoid anything that could hang. +/* + * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] + * warm   Don't set the cold reboot flag + * cold   Set the cold reboot flag + * bios   Reboot by jumping through the BIOS (only for X86_32) + * smp    Reboot by executing reset on BSP or other CPU (only for X86_32) + * triple Force a triple fault (init) + * kbd    Use the keyboard controller. cold reset (default) + * acpi   Use the RESET_REG in the FADT + * efi    Use efi reset_system runtime service + * pci    Use the so-called "PCI reset register", CF9 + * force  Avoid anything that could hang.   */  static int __init reboot_setup(char *str)  {  	for (;;) { -		/* Having anything passed on the command line via +		/* +		 * Having anything passed on the command line via  		 * reboot= will cause us to disable DMI checking  		 * below.  		 */ @@ -98,9 +102,11 @@ static int __init reboot_setup(char *str)  				if (isdigit(*(str+2)))  					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');  			} -				/* we will leave sorting out the final value -				   when we are ready to reboot, since we might not -				   have detected BSP APIC ID or smp_num_cpu */ +			/* +			 * We will leave sorting out the final value +			 * when we are ready to reboot, since we might not +			 * have detected BSP APIC ID or smp_num_cpu +			 */  			break;  #endif /* CONFIG_SMP */ @@ -150,6 +156,82 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)  	return 0;  } +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; + +void machine_real_restart(unsigned int type) +{ +	void *restart_va; +	unsigned long restart_pa; +	void (*restart_lowmem)(unsigned int); +	u64 *lowmem_gdt; + +	local_irq_disable(); + +	/* +	 * Write zero to CMOS register number 0x0f, which the BIOS POST +	 * routine will recognize as telling it to do a proper reboot.  (Well +	 * that's what this book in front of me says -- it may only apply to +	 * the Phoenix BIOS though, it's not clear).  At the same time, +	 * disable NMIs by setting the top bit in the CMOS address register, +	 * as we're about to do peculiar things to the CPU.  I'm not sure if +	 * `outb_p' is needed instead of just `outb'.  Use it to be on the +	 * safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.) +	 */ +	spin_lock(&rtc_lock); +	CMOS_WRITE(0x00, 0x8f); +	spin_unlock(&rtc_lock); + +	/* +	 * Switch back to the initial page table. +	 */ +	load_cr3(initial_page_table); + +	/* +	 * Write 0x1234 to absolute memory location 0x472.  The BIOS reads +	 * this on booting to tell it to "Bypass memory test (also warm +	 * boot)".  
This seems like a fairly standard thing that gets set by +	 * REBOOT.COM programs, and the previous reset routine did this +	 * too. */ +	*((unsigned short *)0x472) = reboot_mode; + +	/* Patch the GDT in the low memory trampoline */ +	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + +	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); +	restart_pa = virt_to_phys(restart_va); +	restart_lowmem = (void (*)(unsigned int))restart_pa; + +	/* GDT[0]: GDT self-pointer */ +	lowmem_gdt[0] = +		(u64)(sizeof(machine_real_restart_gdt) - 1) + +		((u64)virt_to_phys(lowmem_gdt) << 16); +	/* GDT[1]: 64K real mode code segment */ +	lowmem_gdt[1] = +		GDT_ENTRY(0x009b, restart_pa, 0xffff); + +	/* Jump to the identity-mapped low memory code */ +	restart_lowmem(type); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +#endif /* CONFIG_X86_32 */ + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ +	if (reboot_type != BOOT_CF9) { +		reboot_type = BOOT_CF9; +		printk(KERN_INFO "%s series board detected. " +		       "Selecting PCI-method for reboots.\n", d->ident); +	} +	return 0; +} +  static int __init set_kbd_reboot(const struct dmi_system_id *d)  {  	if (reboot_type != BOOT_KBD) { @@ -159,7 +241,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)  	return 0;  } +/* + * This is a single dmi_table handling all reboot quirks.  Note that + * REBOOT_BIOS is only available for 32bit + */  static struct dmi_system_id __initdata reboot_dmi_table[] = { +#ifdef CONFIG_X86_32  	{	/* Handle problems with rebooting on Dell E520's */  		.callback = set_bios_reboot,  		.ident = "Dell E520", @@ -184,7 +271,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),  		},  	}, -	{       /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ +	{	/* Handle problems with rebooting on Dell Optiplex 745's SFF */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 745",  		.matches = { @@ -192,7 +279,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),  		},  	}, -	{       /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ +	{	/* Handle problems with rebooting on Dell Optiplex 745's DFF */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 745",  		.matches = { @@ -201,7 +288,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "0MM599"),  		},  	}, -	{       /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ +	{	/* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 745",  		.matches = { @@ -210,7 +297,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "0KW626"),  		},  	}, -	{   /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ +	{	/* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 330",  		.matches = { @@ -219,7 +306,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),  		},  	}, -	{   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ +	{	/* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 360", 
 		.matches = { @@ -228,7 +315,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),  		},  	}, -	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ +	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */  		.callback = set_bios_reboot,  		.ident = "Dell OptiPlex 760",  		.matches = { @@ -301,7 +388,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),  		},  	}, -	{       /* Handle problems with rebooting on ASUS P4S800 */ +	{	/* Handle problems with rebooting on ASUS P4S800 */  		.callback = set_bios_reboot,  		.ident = "ASUS P4S800",  		.matches = { @@ -309,7 +396,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),  		},  	}, -	{ /* Handle reboot issue on Acer Aspire one */ +#endif /* CONFIG_X86_32 */ + +	{	/* Handle reboot issue on Acer Aspire one */  		.callback = set_kbd_reboot,  		.ident = "Acer Aspire One A110",  		.matches = { @@ -317,96 +406,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),  		},  	}, -	{ } -}; - -static int __init reboot_init(void) -{ -	/* Only do the DMI check if reboot_type hasn't been overridden -	 * on the command line -	 */ -	if (reboot_default) { -		dmi_check_system(reboot_dmi_table); -	} -	return 0; -} -core_initcall(reboot_init); - -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - -void machine_real_restart(unsigned int type) -{ -	void *restart_va; -	unsigned long restart_pa; -	void (*restart_lowmem)(unsigned int); -	u64 *lowmem_gdt; - -	local_irq_disable(); - -	/* Write zero to CMOS register number 0x0f, which the BIOS POST -	   routine will recognize as telling it to do a proper reboot.  (Well -	   that's what this book in front of me says -- it may only apply to -	   the Phoenix BIOS though, it's not clear).  At the same time, -	   disable NMIs by setting the top bit in the CMOS address register, -	   as we're about to do peculiar things to the CPU.  I'm not sure if -	   `outb_p' is needed instead of just `outb'.  Use it to be on the -	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.) -	 */ -	spin_lock(&rtc_lock); -	CMOS_WRITE(0x00, 0x8f); -	spin_unlock(&rtc_lock); - -	/* -	 * Switch back to the initial page table. -	 */ -	load_cr3(initial_page_table); - -	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads -	   this on booting to tell it to "Bypass memory test (also warm -	   boot)".  This seems like a fairly standard thing that gets set by -	   REBOOT.COM programs, and the previous reset routine did this -	   too. 
*/ -	*((unsigned short *)0x472) = reboot_mode; - -	/* Patch the GDT in the low memory trampoline */ -	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - -	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); -	restart_pa = virt_to_phys(restart_va); -	restart_lowmem = (void (*)(unsigned int))restart_pa; - -	/* GDT[0]: GDT self-pointer */ -	lowmem_gdt[0] = -		(u64)(sizeof(machine_real_restart_gdt) - 1) + -		((u64)virt_to_phys(lowmem_gdt) << 16); -	/* GDT[1]: 64K real mode code segment */ -	lowmem_gdt[1] = -		GDT_ENTRY(0x009b, restart_pa, 0xffff); - -	/* Jump to the identity-mapped low memory code */ -	restart_lowmem(type); -} -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(machine_real_restart); -#endif - -#endif /* CONFIG_X86_32 */ - -/* - * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot - */ -static int __init set_pci_reboot(const struct dmi_system_id *d) -{ -	if (reboot_type != BOOT_CF9) { -		reboot_type = BOOT_CF9; -		printk(KERN_INFO "%s series board detected. " -		       "Selecting PCI-method for reboots.\n", d->ident); -	} -	return 0; -} - -static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {  	{	/* Handle problems with rebooting on Apple MacBook5 */  		.callback = set_pci_reboot,  		.ident = "Apple MacBook5", @@ -474,17 +473,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {  	{ }  }; -static int __init pci_reboot_init(void) +static int __init reboot_init(void)  { -	/* Only do the DMI check if reboot_type hasn't been overridden +	/* +	 * Only do the DMI check if reboot_type hasn't been overridden  	 * on the command line  	 */ -	if (reboot_default) { -		dmi_check_system(pci_reboot_dmi_table); -	} +	if (reboot_default) +		dmi_check_system(reboot_dmi_table);  	return 0;  } -core_initcall(pci_reboot_init); +core_initcall(reboot_init);  static inline void kb_wait(void)  { @@ -502,14 +501,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)  	cpu_emergency_vmxoff();  } -/* Use NMIs as IPIs to tell all CPUs to disable virtualization - */ +/* Use NMIs as IPIs to tell all CPUs to disable virtualization */  static void emergency_vmx_disable_all(void)  {  	/* Just make sure we won't change CPUs while doing this */  	local_irq_disable(); -	/* We need to disable VMX on all CPUs before rebooting, otherwise +	/* +	 * We need to disable VMX on all CPUs before rebooting, otherwise  	 * we risk hanging up the machine, because the CPU ignore INIT  	 * signals when VMX is enabled.  	 * @@ -528,8 +527,7 @@ static void emergency_vmx_disable_all(void)  	 * is still enabling VMX.  	 */  	if (cpu_has_vmx() && cpu_vmx_enabled()) { -		/* Disable VMX on this CPU. -		 */ +		/* Disable VMX on this CPU. 
*/  		cpu_vmxoff();  		/* Halt and disable VMX on the other CPUs */ @@ -574,12 +572,12 @@ static void native_machine_emergency_restart(void)  		/* Could also try the reset bit in the Hammer NB */  		switch (reboot_type) {  		case BOOT_KBD: -			mach_reboot_fixups(); /* for board specific fixups */ +			mach_reboot_fixups(); /* For board specific fixups */  			for (i = 0; i < 10; i++) {  				kb_wait();  				udelay(50); -				outb(0xfe, 0x64); /* pulse reset low */ +				outb(0xfe, 0x64); /* Pulse reset low */  				udelay(50);  			}  			if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { @@ -621,7 +619,7 @@ static void native_machine_emergency_restart(void)  		case BOOT_CF9:  			port_cf9_safe = true; -			/* fall through */ +			/* Fall through */  		case BOOT_CF9_COND:  			if (port_cf9_safe) { @@ -659,7 +657,8 @@ void native_machine_shutdown(void)  	/* Make certain I only run on the appropriate processor */  	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); -	/* O.K Now that I'm on the appropriate processor, +	/* +	 * O.K Now that I'm on the appropriate processor,  	 * stop all of the others.  	 */  	stop_other_cpus(); @@ -697,12 +696,11 @@ static void native_machine_restart(char *__unused)  static void native_machine_halt(void)  { -	/* stop other cpus and apics */ +	/* Stop other cpus and apics */  	machine_shutdown();  	tboot_shutdown(TB_SHUTDOWN_HALT); -	/* stop this cpu */  	stop_this_cpu(NULL);  } @@ -713,7 +711,7 @@ static void native_machine_power_off(void)  			machine_shutdown();  		pm_power_off();  	} -	/* a fallback in case there is no PM info available */ +	/* A fallback in case there is no PM info available */  	tboot_shutdown(TB_SHUTDOWN_HALT);  } @@ -775,7 +773,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)  	cpu = raw_smp_processor_id(); -	/* Don't do anything if this handler is invoked on crashing cpu. +	/* +	 * Don't do anything if this handler is invoked on crashing cpu.  	 * Otherwise, system will completely hang. Crashing cpu can get  	 * an NMI if system was initially booted with nmi_watchdog parameter.  	 */ @@ -799,7 +798,8 @@ static void smp_send_nmi_allbutself(void)  	apic->send_IPI_allbutself(NMI_VECTOR);  } -/* Halt all other CPUs, calling the specified function on each of them +/* + * Halt all other CPUs, calling the specified function on each of them   *   * This function can be used to halt all other CPUs on crash   * or emergency reboot time. The function passed as parameter @@ -810,7 +810,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)  	unsigned long msecs;  	local_irq_disable(); -	/* Make a note of crashing cpu. Will be used in NMI callback.*/ +	/* Make a note of crashing cpu. Will be used in NMI callback. */  	crashing_cpu = safe_smp_processor_id();  	shootdown_callback = callback; @@ -819,8 +819,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)  	/* Would it be better to replace the trap vector here? */  	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,  				 NMI_FLAG_FIRST, "crash")) -		return;		/* return what? */ -	/* Ensure the new callback function is set before sending +		return;		/* Return what? 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481ca..48d2b7ded42 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -109,6 +109,9 @@
  *	about nothing of note with C stepping upwards.
  */
 
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
+
 /*
  * this function sends a 'reschedule' IPI to another CPU.
  * it goes straight through and wastes no time serializing
@@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
-static atomic_t stopping_cpu = ATOMIC_INIT(-1);
-
 static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 {
 	/* We are registered on stopping cpu too, avoid spurious NMI */
@@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 	return NMI_HANDLED;
 }
 
-static void native_nmi_stop_other_cpus(int wait)
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+asmlinkage void smp_reboot_interrupt(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+	stop_this_cpu(NULL);
+	irq_exit();
+}
+
+static void native_stop_other_cpus(int wait)
 {
 	unsigned long flags;
 	unsigned long timeout;
@@ -174,20 +187,25 @@
 	 * Use an own vector here because smp_call_function
 	 * does lots of things not suitable in a panic situation.
 	 */
+
+	/*
+	 * We start by using the REBOOT_VECTOR irq.
+	 * The irq is treated as a sync point to allow critical
+	 * regions of code on other cpus to release their spin locks
+	 * and re-enable irqs.  Jumping straight to an NMI might
+	 * accidentally cause deadlocks with further shutdown/panic
+	 * code.  By syncing, we give the cpus up to one second to
+	 * finish their work before we force them off with the NMI.
+	 */
 	if (num_online_cpus() > 1) {
 		/* did someone beat us here? */
 		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
 			return;
 
-		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
-					 NMI_FLAG_FIRST, "smp_stop"))
-			/* Note: we ignore failures here */
-			return;
-
-		/* sync above data before sending NMI */
+		/* sync above data before sending IRQ */
 		wmb();
 
-		apic->send_IPI_allbutself(NMI_VECTOR);
+		apic->send_IPI_allbutself(REBOOT_VECTOR);
 
 		/*
 		 * Don't wait longer than a second if the caller
@@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait)
 		while (num_online_cpus() > 1 && (wait || timeout--))
 			udelay(1);
 	}
+	
+	/* if the REBOOT_VECTOR didn't work, try with the NMI */
+	if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi))  {
+		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+					 NMI_FLAG_FIRST, "smp_stop"))
+			/* Note: we ignore failures here */
+			/* Hope the REBOOT_IRQ is good enough */
+			goto finish;
 
-	local_irq_save(flags);
-	disable_local_APIC();
-	local_irq_restore(flags);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-asmlinkage void smp_reboot_interrupt(void)
-{
-	ack_APIC_irq();
-	irq_enter();
-	stop_this_cpu(NULL);
-	irq_exit();
-}
-
-static void native_irq_stop_other_cpus(int wait)
-{
-	unsigned long flags;
-	unsigned long timeout;
+		/* sync above data before sending IRQ */
+		wmb();
 
-	if (reboot_force)
-		return;
+		pr_emerg("Shutting down cpus with NMI\n");
 
-	/*
-	 * Use an own vector here because smp_call_function
-	 * does lots of things not suitable in a panic situation.
-	 * On most systems we could also use an NMI here,
-	 * but there are a few systems around where NMI
-	 * is problematic so stay with an non NMI for now
-	 * (this implies we cannot stop CPUs spinning with irq off
-	 * currently)
-	 */
-	if (num_online_cpus() > 1) {
-		apic->send_IPI_allbutself(REBOOT_VECTOR);
+		apic->send_IPI_allbutself(NMI_VECTOR);
 
 		/*
-		 * Don't wait longer than a second if the caller
+		 * Don't wait longer than a 10 ms if the caller
 		 * didn't ask us to wait.
 		 */
-		timeout = USEC_PER_SEC;
+		timeout = USEC_PER_MSEC * 10;
 		while (num_online_cpus() > 1 && (wait || timeout--))
 			udelay(1);
 	}
 
+finish:
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
 }
 
-static void native_smp_disable_nmi_ipi(void)
-{
-	smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
-}
-
 /*
  * Reschedule call back.
 */
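The rewritten native_stop_other_cpus() above is easiest to read as two stages: a maskable REBOOT_VECTOR IPI that lets the other CPUs drop their locks and stop cleanly, followed by an NMI fallback for whatever is still spinning with interrupts off. Reduced to its control flow in a hedged sketch; the helper names are invented for illustration, and note that a non-zero 'wait' makes both loops wait indefinitely.

/* Sketch of the escalation logic only; not a drop-in for the kernel code. */
static void stop_other_cpus_sketch(int wait, bool no_nmi)
{
	unsigned long timeout;

	send_reboot_ipi_to_others();		/* stage 1: ordinary IRQ */
	timeout = USEC_PER_SEC;			/* up to 1 s unless wait is set */
	while (others_still_online() && (wait || timeout--))
		udelay(1);

	if (others_still_online() && !no_nmi) {	/* stage 2: NMI fallback */
		send_nmi_to_others();
		timeout = USEC_PER_MSEC * 10;	/* only 10 ms this time */
		while (others_still_online() && (wait || timeout--))
			udelay(1);
	}

	disable_local_apic_with_irqs_off();	/* finally quiesce this CPU's APIC */
}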
@@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 
 static int __init nonmi_ipi_setup(char *str)
 {
-        native_smp_disable_nmi_ipi();
-        return 1;
+	smp_no_nmi_ipi = true;
+	return 1;
 }
 
 __setup("nonmi_ipi", nonmi_ipi_setup);
@@ -298,7 +290,7 @@ struct smp_ops smp_ops = {
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
 	.smp_cpus_done		= native_smp_cpus_done,
 
-	.stop_other_cpus	= native_nmi_stop_other_cpus,
+	.stop_other_cpus	= native_stop_other_cpus,
 	.smp_send_reschedule	= native_smp_send_reschedule,
 
 	.cpu_up			= native_cpu_up,
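Two usage notes on the smp.c hunks above. Booting with nonmi_ipi now simply sets smp_no_nmi_ipi, so only the NMI fallback stage is skipped rather than swapping in an entirely separate stop routine. Callers still reach the function through the smp_ops hook; to the best of my knowledge (this wrapper lives in <asm/smp.h> and is not part of the patch) it looks roughly like:

static inline void stop_other_cpus(void)
{
	smp_ops.stop_other_cpus(1);	/* wait for the other CPUs to stop */
}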
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3acaf51dfdd..433529e29be 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -299,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id)
 		identify_secondary_cpu(c);
 }
 
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
-	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+		"[node: %d != %d]. Ignoring dependency.\n",
+		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(_m, c1, c2)						\
+do {									\
+	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));			\
+	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));			\
+} while (0)
+
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+		if (c->phys_proc_id == o->phys_proc_id &&
+		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+		    c->compute_unit_id == o->compute_unit_id)
+			return topology_sane(c, o, "smt");
+
+	} else if (c->phys_proc_id == o->phys_proc_id &&
+		   c->cpu_core_id == o->cpu_core_id) {
+		return topology_sane(c, o, "smt");
+	}
+
+	return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+		return topology_sane(c, o, "llc");
+
+	return false;
 }
 
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id)
+		return topology_sane(c, o, "mc");
+
+	return false;
+}
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
-	int i;
+	bool has_mc = boot_cpu_data.x86_max_cores > 1;
+	bool has_smt = smp_num_siblings > 1;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct cpuinfo_x86 *o;
+	int i;
 
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
-	if (smp_num_siblings > 1) {
-		for_each_cpu(i, cpu_sibling_setup_mask) {
-			struct cpuinfo_x86 *o = &cpu_data(i);
-
-			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-				if (c->phys_proc_id == o->phys_proc_id &&
-				    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-				    c->compute_unit_id == o->compute_unit_id)
-					link_thread_siblings(cpu, i);
-			} else if (c->phys_proc_id == o->phys_proc_id &&
-				   c->cpu_core_id == o->cpu_core_id) {
-				link_thread_siblings(cpu, i);
-			}
-		}
-	} else {
+	if (!has_smt && !has_mc) {
 		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-	}
-
-	cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-
-	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
-		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-		}
-		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-			cpumask_set_cpu(i, cpu_core_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_core_mask(i));
+		o = &cpu_data(i);
+
+		if ((i == cpu) || (has_smt && match_smt(c, o)))
+			link_mask(sibling, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_llc(c, o)))
+			link_mask(llc_shared, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_mc(c, o))) {
+			link_mask(core, cpu, i);
+
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */
@@ -382,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	 * For perf, we return last level cache shared map.
 	 * And for power savings, we return cpu_core_map
 	 */
-	if ((sched_mc_power_savings || sched_smt_power_savings) &&
-	    !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+	if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
 		return cpu_core_mask(cpu);
 	else
 		return cpu_llc_shared_mask(cpu);
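The new link_mask() macro above pastes the mask name into the accessor, so each topology level gets the same symmetric pair of cpumask updates. For example, link_mask(sibling, cpu, i); expands to:

do {
	cpumask_set_cpu(cpu, cpu_sibling_mask(i));
	cpumask_set_cpu(i, cpu_sibling_mask(cpu));
} while (0);

link_mask(llc_shared, ...) and link_mask(core, ...) expand the same way to cpu_llc_shared_mask() and cpu_core_mask(), which is what the removed link_thread_siblings() used to do for all three levels at once.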
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792a..b79133abda4 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
+#include <asm/asm.h>
 
 int rodata_test(void)
 {
@@ -42,14 +43,7 @@ int rodata_test(void)
 		".section .fixup,\"ax\"\n"
 		"2:	jmp 1b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"       .align 16\n"
-#ifdef CONFIG_X86_32
-		"	.long 0b,2b\n"
-#else
-		"	.quad 0b,2b\n"
-#endif
-		".previous"
+		_ASM_EXTABLE(0b,2b)
 		: [rslt] "=r" (result)
 		: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
 	);
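The test_rodata.c hunk swaps the hand-rolled __ex_table fragment for _ASM_EXTABLE(0b,2b) from <asm/asm.h>, which emits the from/to fixup pair with the right word size on both 32-bit and 64-bit. The same idiom in a self-contained, purely illustrative helper (not part of the patch): a fault at label 1 sends the exception handler to label 3, which records -EFAULT instead of oopsing.

#include <linux/errno.h>
#include <asm/asm.h>

/* Illustrative helper only. */
static inline int try_read_ulong(const unsigned long *addr, unsigned long *val)
{
	int err = 0;

	asm volatile("1:	mov %2, %1\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	mov %3, %0\n"
		     "	jmp 2b\n"
		     ".previous\n"
		     _ASM_EXTABLE(1b, 3b)
		     : "+r" (err), "=r" (*val)
		     : "m" (*addr), "i" (-EFAULT));

	return err;
}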
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index c6eba2b4267..24d3c91e981 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
 #include <linux/i8253.h>
 #include <linux/time.h>
 #include <linux/export.h>
-#include <linux/mca.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc);
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
 	global_clock_event->event_handler(global_clock_event);
-
-	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
-	if (MCA_bus)
-		outb_p(inb_p(0x61)| 0x80, 0x61);
-
 	return IRQ_HANDLED;
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f1602..ff08457a025 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,6 @@
 #include <linux/eisa.h>
 #endif
 
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
@@ -50,6 +46,7 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <linux/atomic.h>
+#include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
@@ -303,8 +300,13 @@ gp_in_kernel:
 }
 
 /* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
 {
+#ifdef CONFIG_DYNAMIC_FTRACE
+	/* ftrace must be first, everything else may cause a recursive crash */
+	if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+		return;
+#endif
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd48..8eeb55a551b 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
+#include <linux/smp.h>
 
 #include <asm/apic.h>
 #include <asm/pci-direct.h>
@@ -22,6 +23,8 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
 #if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
  * Interrupt control on vSMPowered systems:
@@ -149,12 +152,49 @@ int is_vsmp_box(void)
 	return 0;
 }
 #endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+	void __iomem *address;
+	unsigned int cfg, topology, node_shift, maxcpus;
+
+	/*
+	 * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+	 * ones present in the first board, unless explicitly overridden by
+	 * setup_max_cpus
+	 */
+	if (setup_max_cpus != NR_CPUS)
+		return;
+
+	/* Read the vSMP Foundation topology register */
+	cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+	address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+	if (WARN_ON(!address))
+		return;
+
+	topology = readl(address);
+	node_shift = (topology >> 16) & 0x7;
+	if (!node_shift)
+		/* The value 0 should be decoded as 8 */
+		node_shift = 8;
+	maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+	pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+		maxcpus);
+	setup_max_cpus = maxcpus;
+	early_iounmap(address, 4);
+#endif
+}
+
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
 	if (!is_vsmp_box())
 		return;
 
+	vsmp_cap_cpus();
+
 	set_vsmp_pv_ops();
 	return;
 }
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9cf71d0b2d3..35c5e543f55 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
 #include <asm/e820.h>
 #include <asm/time.h>
 #include <asm/irq.h>
+#include <asm/io_apic.h>
 #include <asm/pat.h>
 #include <asm/tsc.h>
 #include <asm/iommu.h>
@@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = {
 	.teardown_msi_irqs = default_teardown_msi_irqs,
 	.restore_msi_irqs = default_restore_msi_irqs,
 };
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+	.init	= native_io_apic_init_mappings,
+	.read	= native_io_apic_read,
+	.write	= native_io_apic_write,
+	.modify	= native_io_apic_modify,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index e62728e30b0..bd18149b2b0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(__thread_has_fpu(tsk));
-
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
 	/*