diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 13:45:50 -0700 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 13:45:50 -0700 | 
| commit | a7e1aabb28e8154ce987b622fd78d80a1ca39361 (patch) | |
| tree | 8671d8faf51d43665045b7362a177a23dc88921b | |
| parent | 111ad119d1765b1bbef2629a5f2bd825caeb7e74 (diff) | |
| parent | 996ba96a97f7406052486682846d68935a60e986 (diff) | |
| download | olio-linux-3.10-a7e1aabb28e8154ce987b622fd78d80a1ca39361.tar.xz olio-linux-3.10-a7e1aabb28e8154ce987b622fd78d80a1ca39361.zip  | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  lguest: Fix in/out emulation
  lguest: Fix translation count about wikipedia's cpuid page
  lguest: Fix three simple typos in comments
  lguest: update comments
  lguest: Simplify device initialization.
  lguest: don't rewrite vmcall instructions
  lguest: remove remaining vmcall
  lguest: use a special 1:1 linear pagetable mode until first switch.
  lguest: Do not exit on non-fatal errors
| -rw-r--r-- | Documentation/virtual/lguest/lguest.c | 47 | ||||
| -rw-r--r-- | arch/x86/include/asm/lguest_hcall.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 1 | ||||
| -rw-r--r-- | arch/x86/lguest/boot.c | 36 | ||||
| -rw-r--r-- | arch/x86/lguest/i386_head.S | 35 | ||||
| -rw-r--r-- | drivers/lguest/core.c | 2 | ||||
| -rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 10 | ||||
| -rw-r--r-- | drivers/lguest/lg.h | 2 | ||||
| -rw-r--r-- | drivers/lguest/lguest_device.c | 37 | ||||
| -rw-r--r-- | drivers/lguest/lguest_user.c | 17 | ||||
| -rw-r--r-- | drivers/lguest/page_tables.c | 282 | ||||
| -rw-r--r-- | drivers/lguest/x86/core.c | 107 | ||||
| -rw-r--r-- | include/linux/lguest.h | 2 | 
13 files changed, 199 insertions, 380 deletions
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index cd9d6af61d0..043bd7df313 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -51,7 +51,7 @@  #include <asm/bootparam.h>  #include "../../../include/linux/lguest_launcher.h"  /*L:110 - * We can ignore the 42 include files we need for this program, but I do want + * We can ignore the 43 include files we need for this program, but I do want   * to draw attention to the use of kernel-style types.   *   * As Linus said, "C is a Spartan language, and so should your naming be."  I @@ -65,7 +65,6 @@ typedef uint16_t u16;  typedef uint8_t u8;  /*:*/ -#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */  #define BRIDGE_PFX "bridge:"  #ifndef SIOCBRADDIF  #define SIOCBRADDIF	0x89a2		/* add interface to bridge      */ @@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq)  	/* writev can return a partial write, so we loop here. */  	while (!iov_empty(iov, out)) {  		int len = writev(STDOUT_FILENO, iov, out); -		if (len <= 0) -			err(1, "Write to stdout gave %i", len); +		if (len <= 0) { +			warn("Write to stdout gave %i (%d)", len, errno); +			break; +		}  		iov_consume(iov, out, len);  	} @@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq)  	 * same format: what a coincidence!  	 */  	if (writev(net_info->tunfd, iov, out) < 0) -		errx(1, "Write to tun failed?"); +		warnx("Write to tun failed (%d)?", errno);  	/*  	 * Done with that one; wait_for_vq_desc() will send the interrupt if @@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq)  	 */  	len = readv(net_info->tunfd, iov, in);  	if (len <= 0) -		err(1, "Failed to read from tun."); +		warn("Failed to read from tun (%d).", errno);  	/*  	 * Mark that packet buffer as used, but don't interrupt here.  We want @@ -1093,9 +1094,10 @@ static void update_device_status(struct device *dev)  		warnx("Device %s configuration FAILED", dev->name);  		if (dev->running)  			reset_device(dev); -	} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { -		if (!dev->running) -			start_device(dev); +	} else { +		if (dev->running) +			err(1, "Device %s features finalized twice", dev->name); +		start_device(dev);  	}  } @@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr)  			return;  		} -		/* -		 * Devices *can* be used before status is set to DRIVER_OK. -		 * The original plan was that they would never do this: they -		 * would always finish setting up their status bits before -		 * actually touching the virtqueues.  In practice, we allowed -		 * them to, and they do (eg. the disk probes for partition -		 * tables as part of initialization). -		 * -		 * If we see this, we start the device: once it's running, we -		 * expect the device to catch all the notifications. -		 */ +		/* Devices should not be used before features are finalized. */  		for (vq = i->vq; vq; vq = vq->next) {  			if (addr != vq->config.pfn*getpagesize())  				continue; -			if (i->running) -				errx(1, "Notification on running %s", i->name); -			/* This just calls create_thread() for each virtqueue */ -			start_device(i); -			return; +			errx(1, "Notification on %s before setup!", i->name);  		}  	} @@ -1370,7 +1358,7 @@ static void setup_console(void)   * --sharenet=<name> option which opens or creates a named pipe.  This can be   * used to send packets to another guest in a 1:1 manner.   * - * More sopisticated is to use one of the tools developed for project like UML + * More sophisticated is to use one of the tools developed for project like UML   * to do networking.   *   * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be @@ -1380,7 +1368,7 @@ static void setup_console(void)   * multiple inter-guest channels behind one interface, although it would   * require some manner of hotplugging new virtio channels.   * - * Finally, we could implement a virtio network switch in the kernel. + * Finally, we could use a virtio network switch in the kernel, ie. vhost.  :*/  static u32 str2ip(const char *ipaddr) @@ -2017,10 +2005,7 @@ int main(int argc, char *argv[])  	/* Tell the entry path not to try to reload segment registers. */  	boot->hdr.loadflags |= KEEP_SEGMENTS; -	/* -	 * We tell the kernel to initialize the Guest: this returns the open -	 * /dev/lguest file descriptor. -	 */ +	/* We tell the kernel to initialize the Guest. */  	tell_kernel(start);  	/* Ensure that we terminate if a device-servicing child dies. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f2924c41..879fd7d3387 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -61,6 +61,7 @@ hcall(unsigned long call,  		     : "memory");  	return call;  } +/*:*/  /* Can't use our min() macro here: needs to be a constant */  #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index c29d631af6f..395a10e6806 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -63,7 +63,6 @@ void foo(void)  	BLANK();  	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);  	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); -	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);  	BLANK();  	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index db832fd65ec..13ee258442a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -71,7 +71,8 @@  #include <asm/stackprotector.h>  #include <asm/reboot.h>		/* for struct machine_ops */ -/*G:010 Welcome to the Guest! +/*G:010 + * Welcome to the Guest!   *   * The Guest in our tale is a simple creature: identical to the Host but   * behaving in simplified but equivalent ways.  In particular, the Guest is the @@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,  #endif  /*G:036 - * When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. -:*/ + * When lazy mode is turned off, we issue the do-nothing hypercall to + * flush any stored calls, and call the generic helper to reset the + * per-cpu lazy mode variable. + */  static void lguest_leave_lazy_mmu_mode(void)  {  	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);  	paravirt_leave_lazy_mmu();  } +/* + * We also catch the end of context switch; we enter lazy mode for much of + * that too, so again we need to flush here. + * + * (Technically, this is lazy CPU mode, and normally we're in lazy MMU + * mode, but unlike Xen, lguest doesn't care about the difference). + */  static void lguest_end_context_switch(struct task_struct *next)  {  	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); @@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)   * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.   *   * This instruction even it has its own Wikipedia entry.  The Wikipedia entry - * has been translated into 5 languages.  I am not making this up! + * has been translated into 6 languages.  I am not making this up!   *   * We could get funky here and identify ourselves as "GenuineLguest", but   * instead we just use the real "cpuid" instruction.  Then I pretty much turned @@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,  	/*  	 * PAE systems can mark pages as non-executable.  Linux calls this the  	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced -	 * Virus Protection).  We just switch turn if off here, since we don't +	 * Virus Protection).  We just switch it off here, since we don't  	 * support it.  	 */  	case 0x80000001: @@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)  /* See lguest_set_pte() below. */  static bool cr3_changed = false; +static unsigned long current_cr3;  /*   * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0.  Keep a local copy, and tell the Host when it changes.  The only - * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. + * cr0.  Keep a local copy, and tell the Host when it changes.   */  static void lguest_write_cr3(unsigned long cr3)  { -	lguest_data.pgdir = cr3;  	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); +	current_cr3 = cr3;  	/* These two page tables are simple, linear, and used during boot */  	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) @@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)  static unsigned long lguest_read_cr3(void)  { -	return lguest_data.pgdir; +	return current_cr3;  }  /* cr4 is used to enable and disable PGE, but we don't care. */ @@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)  /*   * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space.  Wetell the Host the toplevel and + * a page into a process' address space.  We tell the Host the toplevel and   * address this corresponds to.  The Guest uses one pagetable per process, so   * we need to tell the Host which one we're changing (mm->pgd).   */ @@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)  static void lguest_flush_tlb_single(unsigned long addr)  {  	/* Simply set it to zero: if it was not, it will fault back in. */ -	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); +	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);  }  /* @@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {  static __init char *lguest_memory_setup(void)  {  	/* -	 *The Linux bootloader header contains an "e820" memory map: the +	 * The Linux bootloader header contains an "e820" memory map: the  	 * Launcher populated the first entry with our memory limit.  	 */  	e820_add_region(boot_params.e820_map[0].addr, diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d5..6ddfe4fc23c 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -6,18 +6,22 @@  #include <asm/processor-flags.h>  /*G:020 - * Our story starts with the kernel booting into startup_32 in - * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by - * the bootloader (the Launcher in our case). + + * Our story starts with the bzImage: booting starts at startup_32 in + * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real + * kernel in place and then jumps into it: startup_32 in + * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi + * register, which is created by the bootloader (the Launcher in our case).   *   * The startup_32 function does very little: it clears the uninitialized global   * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe.  Finally it checks the - * 'hardware_subarch' field.  This was introduced in 2.6.24 for lguest and Xen: - * if it's set to '1' (lguest's assigned number), then it calls us here. + * header and kernel command line somewhere safe, and populates some initial + * page tables.  Finally it checks the 'hardware_subarch' field.  This was + * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's + * assigned number), then it calls us here.   *   * WARNING: be very careful here!  We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expectes + * addresses (around 0), not above PAGE_OFFSET as most code expects   * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any   * data without remembering to subtract __PAGE_OFFSET!   * @@ -27,13 +31,18 @@  .section .init.text, "ax", @progbits  ENTRY(lguest_entry)  	/* -	 * We make the "initialization" hypercall now to tell the Host about -	 * us, and also find out where it put our page tables. +	 * We make the "initialization" hypercall now to tell the Host where +	 * our lguest_data struct is.  	 */  	movl $LHCALL_LGUEST_INIT, %eax  	movl $lguest_data - __PAGE_OFFSET, %ebx  	int $LGUEST_TRAP_ENTRY +	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ +	movl $LHCALL_NEW_PGTABLE, %eax +	movl $(initial_page_table - __PAGE_OFFSET), %ebx +	int $LGUEST_TRAP_ENTRY +  	/* Set up the initial stack so we can run C code. */  	movl $(init_thread_union+THREAD_SIZE),%esp @@ -96,12 +105,8 @@ send_interrupts:  	 */  	pushl %eax  	movl $LHCALL_SEND_INTERRUPTS, %eax -	/* -	 * This is a vmcall instruction (same thing that KVM uses).  Older -	 * assembler versions might not know the "vmcall" instruction, so we -	 * create one manually here. -	 */ -	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ +	/* This is the actual hypercall trap. */ +	int  $LGUEST_TRAP_ENTRY  	/* Put eax back the way we found it. */  	popl %eax  	ret diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e3..2535933c49f 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -117,7 +117,7 @@ static __init int map_switcher(void)  	/*  	 * Now the Switcher is mapped at the right address, we can't fail! -	 * Copy in the compiled-in Switcher code (from <arch>_switcher.S). +	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).  	 */  	memcpy(switcher_vma->addr, start_switcher_text,  	       end_switcher_text - start_switcher_text); diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index daaf8663164..28433a155d6 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num)  	/*  	 * The Host needs to see page faults (for shadow paging and to save the  	 * fault address), general protection faults (in/out emulation) and -	 * device not available (TS handling), invalid opcode fault (kvm hcall), -	 * and of course, the hypercall trap. +	 * device not available (TS handling) and of course, the hypercall trap.  	 */ -	return num != 14 && num != 13 && num != 7 && -			num != 6 && num != LGUEST_TRAP_ENTRY; +	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;  }  /*:*/ @@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu)  /*   * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the IDT entries to use that - * stack.  The IDT entries expect a virtual address, so unlike most addresses + * a different kernel stack, so we can change the guest TSS to use that + * stack.  The TSS entries expect a virtual address, so unlike most addresses   * the Guest gives us, the "esp" (stack pointer) value here is virtual, not   * physical.   * diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9136411fadd..295df06e659 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -59,6 +59,8 @@ struct lg_cpu {  	struct lguest_pages *last_pages; +	/* Initialization mode: linear map everything. */ +	bool linear_pages;  	int cpu_pgd; /* Which pgd this cpu is currently using */  	/* If a hypercall was asked for, this points to the arguments. */ diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88e..5289ffa2e50 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -109,6 +109,17 @@ static u32 lg_get_features(struct virtio_device *vdev)  }  /* + * To notify on reset or feature finalization, we (ab)use the NOTIFY + * hypercall, with the descriptor address of the device. + */ +static void status_notify(struct virtio_device *vdev) +{ +	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + +	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); +} + +/*   * The virtio core takes the features the Host offers, and copies the ones   * supported by the driver into the vdev->features array.  Once that's all   * sorted out, this routine is called so we can tell the Host which features we @@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev)  		if (test_bit(i, vdev->features))  			out_features[i / 8] |= (1 << (i % 8));  	} + +	/* Tell Host we've finished with this device's feature negotiation */ +	status_notify(vdev);  }  /* Once they've found a field, getting a copy of it is easy. */ @@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev)  	return to_lgdev(vdev)->desc->status;  } -/* - * To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device.  A zero status means "reset". - */ -static void set_status(struct virtio_device *vdev, u8 status) -{ -	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - -	/* We set the status. */ -	to_lgdev(vdev)->desc->status = status; -	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} -  static void lg_set_status(struct virtio_device *vdev, u8 status)  {  	BUG_ON(!status); -	set_status(vdev, status); +	to_lgdev(vdev)->desc->status = status; + +	/* Tell Host immediately if we failed. */ +	if (status & VIRTIO_CONFIG_S_FAILED) +		status_notify(vdev);  }  static void lg_reset(struct virtio_device *vdev)  { -	set_status(vdev, 0); +	/* 0 status means "reset" */ +	to_lgdev(vdev)->desc->status = 0; +	status_notify(vdev);  }  /* diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9..f97e625241a 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher - * controls and communicates with the Guest.  For example, the first write will - * tell us the Guest's memory layout and entry point.  A read will run the - * Guest until something happens, such as a signal or the Guest doing a NOTIFY - * out to the Launcher. +/*P:200 This contains all the /dev/lguest code, whereby the userspace + * launcher controls and communicates with the Guest.  For example, + * the first write will tell us the Guest's memory layout and entry + * point.  A read will run the Guest until something happens, such as + * a signal or the Guest doing a NOTIFY out to the Launcher.  There is + * also a way for the Launcher to attach eventfds to particular NOTIFY + * values instead of returning from the read() call.  :*/  #include <linux/uaccess.h>  #include <linux/miscdevice.h> @@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input)  		goto free_eventfds;  	/* -	 * Initialize the Guest's shadow page tables, using the toplevel -	 * address the Launcher gave us.  This allocates memory, so can fail. +	 * Initialize the Guest's shadow page tables.  This allocates +	 * memory, so can fail.  	 */  	err = init_guest_pagetable(lg);  	if (err) @@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = {  	.read	 = read,  	.llseek  = default_llseek,  }; +/*:*/  /*   * This is a textbook example of a "misc" character device.  Populate a "struct diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d21578ee95d..3b62be160a6 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -17,7 +17,6 @@  #include <linux/percpu.h>  #include <asm/tlbflush.h>  #include <asm/uaccess.h> -#include <asm/bootparam.h>  #include "lg.h"  /*M:008 @@ -156,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)  }  /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest   * page tables.  Hence they return a Guest address.   */  static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -196,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,  #endif  /*:*/ -/*M:014 +/*M:007   * get_pfn is slow: we could probably try to grab batches of pages here as   * an optimization (ie. pre-faulting).  :*/ @@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  #endif  	/* First step: get the top-level Guest page table entry. */ -	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); -	/* Toplevel not present?  We can't map it in. */ -	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) -		return false; +	if (unlikely(cpu->linear_pages)) { +		/* Faking up a linear mapping. */ +		gpgd = __pgd(CHECK_GPGD_MASK); +	} else { +		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); +		/* Toplevel not present?  We can't map it in. */ +		if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) +			return false; +	}  	/* Now look at the matching shadow entry. */  	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); @@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	}  #ifdef CONFIG_X86_PAE -	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); -	/* Middle level not present?  We can't map it in. */ -	if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) -		return false; +	if (unlikely(cpu->linear_pages)) { +		/* Faking up a linear mapping. */ +		gpmd = __pmd(_PAGE_TABLE); +	} else { +		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); +		/* Middle level not present?  We can't map it in. */ +		if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) +			return false; +	}  	/* Now look at the matching shadow entry. */  	spmd = spmd_addr(cpu, *spgd, vaddr); @@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);  #endif -	/* Read the actual PTE value. */ -	gpte = lgread(cpu, gpte_ptr, pte_t); +	if (unlikely(cpu->linear_pages)) { +		/* Linear?  Make up a PTE which points to same page. */ +		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); +	} else { +		/* Read the actual PTE value. */ +		gpte = lgread(cpu, gpte_ptr, pte_t); +	}  	/* If this page isn't in the Guest page tables, we can't page it in. */  	if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	 * Finally, we write the Guest PTE entry back: we've set the  	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.  	 */ -	lgwrite(cpu, gpte_ptr, pte_t, gpte); +	if (likely(!cpu->linear_pages)) +		lgwrite(cpu, gpte_ptr, pte_t, gpte);  	/*  	 * The fault is fixed, the page table is populated, the mapping @@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)  #ifdef CONFIG_X86_PAE  	pmd_t gpmd;  #endif + +	/* Still not set up?  Just map 1:1. */ +	if (unlikely(cpu->linear_pages)) +		return vaddr; +  	/* First step: get the top-level Guest page table entry. */  	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);  	/* Toplevel not present?  We can't map it in. */ @@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,  	return next;  } -/*H:430 - * (iv) Switching page tables - * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir).  This occurs on almost every context switch. - */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) -{ -	int newpgdir, repin = 0; - -	/* Look to see if we have this one already. */ -	newpgdir = find_pgdir(cpu->lg, pgtable); -	/* -	 * If not, we allocate or mug an existing one: if it's a fresh one, -	 * repin gets set to 1. -	 */ -	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) -		newpgdir = new_pgdir(cpu, pgtable, &repin); -	/* Change the current pgd index to the new one. */ -	cpu->cpu_pgd = newpgdir; -	/* If it was completely blank, we map in the Guest kernel stack */ -	if (repin) -		pin_stack_pages(cpu); -} -  /*H:470   * Finally, a routine which throws away everything: all PGD entries in all   * the shadow page tables, including the Guest's kernel mappings.  This is used @@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)  	/* We need the Guest kernel stack mapped again. */  	pin_stack_pages(cpu);  } + +/*H:430 + * (iv) Switching page tables + * + * Now we've seen all the page table setting and manipulation, let's see + * what happens when the Guest changes page tables (ie. changes the top-level + * pgdir).  This occurs on almost every context switch. + */ +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +{ +	int newpgdir, repin = 0; + +	/* +	 * The very first time they call this, we're actually running without +	 * any page tables; we've been making it up.  Throw them away now. +	 */ +	if (unlikely(cpu->linear_pages)) { +		release_all_pagetables(cpu->lg); +		cpu->linear_pages = false; +		/* Force allocation of a new pgdir. */ +		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); +	} else { +		/* Look to see if we have this one already. */ +		newpgdir = find_pgdir(cpu->lg, pgtable); +	} + +	/* +	 * If not, we allocate or mug an existing one: if it's a fresh one, +	 * repin gets set to 1. +	 */ +	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) +		newpgdir = new_pgdir(cpu, pgtable, &repin); +	/* Change the current pgd index to the new one. */ +	cpu->cpu_pgd = newpgdir; +	/* If it was completely blank, we map in the Guest kernel stack */ +	if (repin) +		pin_stack_pages(cpu); +}  /*:*/  /*M:009 @@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)  }  #endif -/*H:505 - * To get through boot, we construct simple identity page mappings (which - * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own.  The linear mapping means we - * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, - * as you'll see. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). - */ -static unsigned long setup_pagetables(struct lguest *lg, -				      unsigned long mem, -				      unsigned long initrd_size) -{ -	pgd_t __user *pgdir; -	pte_t __user *linear; -	unsigned long mem_base = (unsigned long)lg->mem_base; -	unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE -	pmd_t __user *pmds; -	unsigned int j; -	pgd_t pgd; -	pmd_t pmd; -#else -	unsigned int phys_linear; -#endif - -	/* -	 * We have mapped_pages frames to map, so we need linear_pages page -	 * tables to map them. -	 */ -	mapped_pages = mem / PAGE_SIZE; -	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; - -	/* We put the toplevel page directory page at the top of memory. */ -	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); - -	/* Now we use the next linear_pages pages as pte pages */ -	linear = (void *)pgdir - linear_pages * PAGE_SIZE; - -#ifdef CONFIG_X86_PAE -	/* -	 * And the single mid page goes below that.  We only use one, but -	 * that's enough to map 1G, which definitely gets us through boot. -	 */ -	pmds = (void *)linear - PAGE_SIZE; -#endif -	/* -	 * Linear mapping is easy: put every page's address into the -	 * mapping in order. -	 */ -	for (i = 0; i < mapped_pages; i++) { -		pte_t pte; -		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); -		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) -			return -EFAULT; -	} - -#ifdef CONFIG_X86_PAE -	/* -	 * Make the Guest PMD entries point to the corresponding place in the -	 * linear mapping (up to one page worth of PMD). -	 */ -	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; -	     i += PTRS_PER_PTE, j++) { -		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, -			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - -		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) -			return -EFAULT; -	} - -	/* One PGD entry, pointing to that PMD page. */ -	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); -	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */ -	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) -		return -EFAULT; -	/* -	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET -	 */ -	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) -		return -EFAULT; -#else -	/* -	 * The top level points to the linear page table pages above. -	 * We setup the identity and linear mappings here. -	 */ -	phys_linear = (unsigned long)linear - mem_base; -	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { -		pgd_t pgd; -		/* -		 * Create a PGD entry which points to the right part of the -		 * linear PTE pages. -		 */ -		pgd = __pgd((phys_linear + i * sizeof(pte_t)) | -			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - -		/* -		 * Copy it into the PGD page at 0 and PAGE_OFFSET. -		 */ -		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) -		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) -					   + i / PTRS_PER_PTE], -				    &pgd, sizeof(pgd))) -			return -EFAULT; -	} -#endif - -	/* -	 * We return the top level (guest-physical) address: we remember where -	 * this is to write it into lguest_data when the Guest initializes. -	 */ -	return (unsigned long)pgdir - mem_base; -} -  /*H:500   * (vii) Setting up the page tables initially.   * - * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is.  We set some things up here: + * When a Guest is first created, set initialize a shadow page table which + * we will populate on future faults.  The Guest doesn't have any actual + * pagetables yet, so we set linear_pages to tell demand_page() to fake it + * for the moment.   */  int init_guest_pagetable(struct lguest *lg)  { -	u64 mem; -	u32 initrd_size; -	struct boot_params __user *boot = (struct boot_params *)lg->mem_base; -#ifdef CONFIG_X86_PAE -	pgd_t *pgd; -	pmd_t *pmd_table; -#endif -	/* -	 * Get the Guest memory size and the ramdisk size from the boot header -	 * located at lg->mem_base (Guest address 0). -	 */ -	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) -	    || get_user(initrd_size, &boot->hdr.ramdisk_size)) -		return -EFAULT; +	struct lg_cpu *cpu = &lg->cpus[0]; +	int allocated = 0; -	/* -	 * We start on the first shadow page table, and give it a blank PGD -	 * page. -	 */ -	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); -	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) -		return lg->pgdirs[0].gpgdir; -	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); -	if (!lg->pgdirs[0].pgdir) +	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ +	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); +	if (!allocated)  		return -ENOMEM; -#ifdef CONFIG_X86_PAE -	/* For PAE, we also create the initial mid-level. */ -	pgd = lg->pgdirs[0].pgdir; -	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); -	if (!pmd_table) -		return -ENOMEM; - -	set_pgd(pgd + SWITCHER_PGD_INDEX, -		__pgd(__pa(pmd_table) | _PAGE_PRESENT)); -#endif - -	/* This is the current page table. */ -	lg->cpus[0].cpu_pgd = 0; +	/* We start with a linear mapping until the initialize. */ +	cpu->linear_pages = true;  	return 0;  } @@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)  		 * of virtual addresses used by the Switcher.  		 */  		|| put_user(RESERVE_MEM * 1024 * 1024, -			&cpu->lg->lguest_data->reserve_mem) -		|| put_user(cpu->lg->pgdirs[0].gpgdir, -			&cpu->lg->lguest_data->pgdir)) +			    &cpu->lg->lguest_data->reserve_mem)) {  		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); +		return; +	}  	/*  	 * In flush_user_mappings() we loop from 0 to diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 9f1659c3d1f..65af42f2d59 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  static int emulate_insn(struct lg_cpu *cpu)  {  	u8 insn; -	unsigned int insnlen = 0, in = 0, shift = 0; +	unsigned int insnlen = 0, in = 0, small_operand = 0;  	/*  	 * The eip contains the *virtual* address of the Guest's instruction: -	 * guest_pa just subtracts the Guest's page_offset. +	 * walk the Guest's page tables to find the "physical" address.  	 */  	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); @@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu)  	}  	/* -	 * 0x66 is an "operand prefix".  It means it's using the upper 16 bits -	 * of the eax register. +	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.  	 */  	if (insn == 0x66) { -		shift = 16; +		small_operand = 1;  		/* The instruction is 1 byte so far, read the next byte. */  		insnlen = 1;  		insn = lgread(cpu, physaddr + insnlen, u8); @@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu)  	 * traditionally means "there's nothing there".  	 */  	if (in) { -		/* Lower bit tells is whether it's a 16 or 32 bit access */ -		if (insn & 0x1) -			cpu->regs->eax = 0xFFFFFFFF; -		else -			cpu->regs->eax |= (0xFFFF << shift); +		/* Lower bit tells means it's a 32/16 bit access */ +		if (insn & 0x1) { +			if (small_operand) +				cpu->regs->eax |= 0xFFFF; +			else +				cpu->regs->eax = 0xFFFFFFFF; +		} else +			cpu->regs->eax |= 0xFF;  	}  	/* Finally, we've "done" the instruction, so move past it. */  	cpu->regs->eip += insnlen; @@ -352,69 +354,6 @@ static int emulate_insn(struct lg_cpu *cpu)  	return 1;  } -/* - * Our hypercalls mechanism used to be based on direct software interrupts. - * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to - * change over to using kvm hypercalls. - * - * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid - * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be - * an *emulation approach*: if the fault was really produced by an hypercall - * (is_hypercall() does exactly this check), we can just call the corresponding - * hypercall host implementation function. - * - * But these invalid opcode faults are notably slower than software interrupts. - * So we implemented the *patching (or rewriting) approach*: every time we hit - * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f" - * opcode, so next time the Guest calls this hypercall it will use the - * faster trap mechanism. - * - * Matias even benchmarked it to convince you: this shows the average cycle - * cost of a hypercall.  For each alternative solution mentioned above we've - * made 5 runs of the benchmark: - * - * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 - * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 - * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 - * - * One two-line function is worth a 20% hypercall speed boost! - */ -static void rewrite_hypercall(struct lg_cpu *cpu) -{ -	/* -	 * This are the opcodes we use to patch the Guest.  The opcode for "int -	 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we -	 * complete the sequence with a NOP (0x90). -	 */ -	u8 insn[3] = {0xcd, 0x1f, 0x90}; - -	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); -	/* -	 * The above write might have caused a copy of that page to be made -	 * (if it was read-only).  We need to make sure the Guest has -	 * up-to-date pagetables.  As this doesn't happen often, we can just -	 * drop them all. -	 */ -	guest_pagetable_clear_all(cpu); -} - -static bool is_hypercall(struct lg_cpu *cpu) -{ -	u8 insn[3]; - -	/* -	 * This must be the Guest kernel trying to do something. -	 * The bottom two bits of the CS segment register are the privilege -	 * level. -	 */ -	if ((cpu->regs->cs & 3) != GUEST_PL) -		return false; - -	/* Is it a vmcall? */ -	__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); -	return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; -} -  /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */  void lguest_arch_handle_trap(struct lg_cpu *cpu)  { @@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)  			if (emulate_insn(cpu))  				return;  		} -		/* -		 * If KVM is active, the vmcall instruction triggers a General -		 * Protection Fault.  Normally it triggers an invalid opcode -		 * fault (6): -		 */ -	case 6: -		/* -		 * We need to check if ring == GUEST_PL and faulting -		 * instruction == vmcall. -		 */ -		if (is_hypercall(cpu)) { -			rewrite_hypercall(cpu); -			return; -		}  		break;  	case 14: /* We've intercepted a Page Fault. */  		/* @@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)  		 * These values mean a real interrupt occurred, in which case  		 * the Host handler has already been run. We just do a  		 * friendly check if another process should now be run, then -		 * return to run the Guest again +		 * return to run the Guest again.  		 */  		cond_resched();  		return; @@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void)  	int i;  	/* -	 * Most of the i386/switcher.S doesn't care that it's been moved; on +	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on  	 * Intel, jumps are relative, and it doesn't access any references to  	 * external code or data.  	 * @@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void)  		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);  	}  	put_online_cpus(); -}; +}  /*:*/  void __exit lguest_arch_host_fini(void) @@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)  /*:*/  /*L:030 - * lguest_arch_setup_regs() - *   * Most of the Guest's registers are left alone: we used get_zeroed_page() to   * allocate the structure, so they will be 0.   */ diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 2fb1dcbcb5a..9962c6bb131 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -59,8 +59,6 @@ struct lguest_data {  	unsigned long reserve_mem;  	/* KHz for the TSC clock. */  	u32 tsc_khz; -	/* Page where the top-level pagetable is */ -	unsigned long pgdir;  /* Fields initialized by the Guest at boot: */  	/* Instruction range to suppress interrupts even if enabled */  |