Diffstat (limited to 'arch/x86/xen/enlighten.c')
 -rw-r--r--  arch/x86/xen/enlighten.c | 226
 1 file changed, 173 insertions(+), 53 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ff962d4b821..bf4bda6d3e9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
+struct tls_descs {
+	struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive, so we avoid updating them if they haven't
+ * changed.  Since Xen writes different descriptors than the ones
+ * passed in the update_descriptor hypercall, we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
 	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
+		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
 	if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 		BUG();
 }
 
+static inline bool desc_equal(const struct desc_struct *d1,
+			      const struct desc_struct *d2)
+{
+	return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-	struct multicall_space mc = __xen_mc_entry(0);
+	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+	struct desc_struct *gdt;
+	xmaddr_t maddr;
+	struct multicall_space mc;
+
+	if (desc_equal(shadow, &t->tls_array[i]))
+		return;
+
+	*shadow = t->tls_array[i];
+
+	gdt = get_cpu_gdt_table(cpu);
+	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
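The shadow-copy optimisation in load_TLS_descriptor() above is easy to demonstrate outside the kernel. The following is a minimal user-space sketch of the same cache-and-compare pattern; struct desc, shadow[] and expensive_update() are invented stand-ins for desc_struct, shadow_tls_desc and the update_descriptor multicall, not kernel APIs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct desc { uint32_t a, b; };  /* two 32-bit halves, as in desc_struct */

static struct desc shadow[3];    /* per-CPU in the kernel; one copy here */

static bool desc_equal(const struct desc *d1, const struct desc *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

/* Stand-in for the expensive MULTI_update_descriptor hypercall. */
static void expensive_update(int i, const struct desc *d)
{
	printf("updating TLS slot %d to %08x:%08x\n", i, d->a, d->b);
}

static void load_tls_slot(int i, const struct desc *new)
{
	if (desc_equal(&shadow[i], new))
		return;          /* unchanged: skip the expensive update */
	shadow[i] = *new;        /* remember what we last wrote */
	expensive_update(i, new);
}

int main(void)
{
	struct desc d = { .a = 1, .b = 2 };

	load_tls_slot(0, &d);    /* first switch: update issued */
	load_tls_slot(0, &d);    /* same value: update skipped */
	return 0;
}
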
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	/*
 	 * Look for known traps using IST, and substitute them
 	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault and
-	 * machine_check, so we should never see them.  Warn if
+	 * about.  Xen will handle faults like double_fault,
+	 * so we should never see them.  Warn if
 	 * there's an unexpected IST-using fault handler.
 	 */
 	if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/*
+		 * When the Xen hypervisor injects a vMCE into the guest,
+		 * use the native MCE handler to handle it.
+		 */
+		;
 #endif
 	} else {
 		/* Some other trap using IST? */
@@ -1124,9 +1157,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.wbinvd = native_wbinvd,
 
 	.read_msr = native_read_msr_safe,
-	.rdmsr_regs = native_rdmsr_safe_regs,
 	.write_msr = xen_write_msr_safe,
-	.wrmsr_regs = native_wrmsr_safe_regs,
 
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
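The machine_check hunk above is subtle: the old "return 0" told the caller not to register the vector with Xen at all, while the new bare ";" falls through, so the trap is still installed and the guest's native MCE code can handle vMCEs injected by the hypervisor. A simplified stand-alone model of that control flow, with invented vector names (cvt_gate_to_trap() in the kernel does the real work):

#include <stdio.h>

enum vec { VEC_DEBUG, VEC_MCE, VEC_OTHER };

static int cvt_gate(enum vec v)
{
	if (v == VEC_DEBUG) {
		return 0;   /* Xen handles it; don't register the trap */
	} else if (v == VEC_MCE) {
		/* fall through: let the guest's native handler run */
		;
	} else {
		/* some other IST user would warn here */
	}
	return 1;           /* register the trap with Xen */
}

int main(void)
{
	printf("debug -> %d, mce -> %d\n",
	       cvt_gate(VEC_DEBUG), cvt_gate(VEC_MCE));
	return 0;
}
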
@@ -1439,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static int init_hvm_pv_info(int *major, int *minor)
-{
-	uint32_t eax, ebx, ecx, edx, pages, msr, base;
-	u64 pfn;
-
-	base = xen_cpuid_base();
-	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
-	*major = eax >> 16;
-	*minor = eax & 0xffff;
-	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
-	cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-	xen_setup_features();
-
-	pv_info.name = "Xen HVM";
-
-	xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) cannot know where
+ * the pfn is, so it cannot reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occurs
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window where the old pfn is not backed by an mfn, and during
+ * that time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located, it
+ * cannot be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops are called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last, which
+ * reduces the risk further.
+ */
 
-	return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
 
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
 {
-	int cpu;
 	struct xen_add_to_physmap xatp;
-	static struct shared_info *shared_info_page = 0;
 
-	if (!shared_info_page)
-		shared_info_page = (struct shared_info *)
-			extend_brk(PAGE_SIZE, PAGE_SIZE);
 	xatp.domid = DOMID_SELF;
 	xatp.idx = 0;
 	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	xatp.gpfn = pfn;
 	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
 		BUG();
+}
 
-	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+	int cpu;
+
+	HYPERVISOR_shared_info = sip;
 
 	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
 	 * page, we use it in the event channel upcall and in some pvclock
 	 * related functions. We don't need the vcpu_info placement
 	 * optimizations because we don't use any pv_mmu or pv_irq op on
 	 * HVM.
-	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-	 * online but xen_hvm_init_shared_info is run at resume time too and
+	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_set_shared_info is run at resume time too and
 	 * in that case multiple vcpus might be online. */
 	for_each_online_cpu(cpu) {
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 	}
 }
 
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to an mfn */
+void xen_hvm_resume_shared_info(void)
+{
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+	xen_hvm_shared_info_kexec = sip;
+	xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+	struct xen_memory_reservation reservation = {
+		.domid = DOMID_SELF,
+		.nr_extents = 1,
+	};
+	unsigned long prev_pfn;
+	int rc;
+
+	if (!xen_hvm_shared_info_kexec)
+		return;
+
+	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+	set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+	/* Move the pfn to MMIO; this disconnects the previous pfn from its mfn */
+	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+	/* Update pointers; the following hypercall is also a memory barrier */
+	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+	/* Allocate a new mfn for the previous pfn */
+	do {
+		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+		if (rc == 0)
+			msleep(123);
+	} while (rc == 0);
+
+	/* Make sure the previous pfn is really connected to a (new) mfn */
+	BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+	.shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM; it may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+	/* Remember the pointer for resume */
+	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+	xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+	int major, minor;
+	uint32_t eax, ebx, ecx, edx, pages, msr, base;
+	u64 pfn;
+
+	base = xen_cpuid_base();
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	xen_setup_features();
+
+	pv_info.name = "Xen HVM";
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+}
+
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 				    unsigned long action, void *hcpu)
 {
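To see how the new shared_info helpers fit together, here is a stand-alone sketch of the lifecycle with the hypercalls stubbed out: boot places the page at a RAM pfn, resume reconnects that same pfn, and the kexec shutdown hook retargets the mapping to the MMIO pfn. All functions and pfn values below are illustrative stand-ins, not the kernel code:

#include <stdio.h>

static unsigned long shared_info_pfn;  /* pfn backing shared_info (RAM) */
static unsigned long kexec_mmio_pfn;   /* MMIO pfn saved for kexec, if any */

/* Models XENMEM_add_to_physmap: point the shared_info mfn at this pfn. */
static void connect_shared_info(unsigned long pfn)
{
	printf("shared_info mapped at pfn %#lx\n", pfn);
}

static void boot(void)
{
	shared_info_pfn = 0x1234;      /* kernel: extend_brk() page in RAM */
	connect_shared_info(shared_info_pfn);
}

static void resume(void)
{
	/* Same pfn as at boot; the mapping must be re-established. */
	connect_shared_info(shared_info_pfn);
}

static void kexec_shutdown(void)
{
	if (!kexec_mmio_pfn)
		return;                /* no MMIO page was prepared */
	/* Retarget to MMIO; the old RAM pfn loses its mfn here, so the
	 * kernel immediately repopulates it via populate_physmap. */
	connect_shared_info(kexec_mmio_pfn);
}

int main(void)
{
	boot();
	resume();
	kexec_mmio_pfn = 0xf0000;      /* as set by xen_hvm_prepare_kexec() */
	kexec_shutdown();
	return 0;
}
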
@@ -1519,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 
 static void __init xen_hvm_guest_init(void)
 {
-	int r;
-	int major, minor;
-
-	r = init_hvm_pv_info(&major, &minor);
-	if (r < 0)
-		return;
+	init_hvm_pv_info();
 
 	xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+	register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
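Finally, the rewritten init_hvm_pv_info() relies on the standard Xen CPUID probe: scan the hypervisor leaf range for the "XenVMMXenVMM" signature, then read the version from leaf base+1. The same probe can be reproduced from user space; this sketch assumes GCC's <cpuid.h> and an x86 host:

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t eax, ebx, ecx, edx, base;
	char sig[13];

	/* Same scan as the kernel's xen_cpuid_base(): 0x100 steps. */
	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';
		/* Need at least leaf base+2 for the hypercall MSR. */
		if (!strcmp(sig, "XenVMMXenVMM") && eax >= base + 2) {
			__cpuid(base + 1, eax, ebx, ecx, edx);
			printf("Xen version %u.%u\n", eax >> 16, eax & 0xffff);
			return 0;
		}
	}
	printf("not running on Xen\n");
	return 1;
}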