Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  194
1 files changed, 72 insertions, 122 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f..956ca358108 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 
 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 	if (host_level == PT_PAGE_TABLE_LEVEL)
 		return host_level;
 
-	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
-		kvm_x86_ops->get_lpage_level() : host_level;
+	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
 		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-				 int level, bool pt_protect)
+				 bool pt_protect)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 				      PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+		__rmap_write_protect(kvm, rmapp, false);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+		write_protected |= __rmap_write_protect(kvm, rmapp, true);
 	}
 
 	return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
-	if (!sp->role.direct)
-		free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
 	list_del(&sp->link);
 	free_page((unsigned long)sp->spt);
+	if (!sp->role.direct)
+		free_page((unsigned long)sp->gfns);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1659,13 +1644,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
-  hlist_for_each_entry(sp, pos,						\
+#define for_each_gfn_sp(kvm, sp, gfn)					\
+  hlist_for_each_entry(sp,						\
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
 	if ((sp)->gfn != (gfn)) {} else
 
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
-  hlist_for_each_entry(sp, pos,						\
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn)			\
+  hlist_for_each_entry(sp,						\
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
 		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
 			(sp)->role.invalid) {} else
@@ -1721,11 +1706,10 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	bool flush = false;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
 		if (!s->unsync)
 			continue;
 
@@ -1863,7 +1847,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	union kvm_mmu_page_role role;
 	unsigned quadrant;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
 	bool need_sync = false;
 
 	role = vcpu->arch.mmu.base_role;
@@ -1878,7 +1861,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -1973,9 +1956,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
 	u64 spte;
 
-	spte = __pa(sp->spt)
-		| PT_PRESENT_MASK | PT_ACCESSED_MASK
-		| PT_WRITABLE_MASK | PT_USER_MASK;
+	spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
 	mmu_spte_set(sptep, spte);
 }
 
@@ -2126,7 +2109,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
-		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
 }
@@ -2144,6 +2126,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	 * change the value
 	 */
 
+	spin_lock(&kvm->mmu_lock);
+
 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
 		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,19 +2142,20 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	}
 
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+	spin_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	int r;
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
 	spin_lock(&kvm->mmu_lock);
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
@@ -2183,14 +2168,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-	int slot = memslot_id(kvm, gfn);
-	struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-	__set_bit(slot, sp->slot_bitmap);
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2308,9 +2285,8 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
 		if (s->unsync)
 			continue;
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
@@ -2322,19 +2298,17 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node;
 	bool need_unsync = false;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
 		if (!can_unsync)
 			return 1;
 
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;
 
-		if (!need_unsync && !s->unsync) {
+		if (!s->unsync)
 			need_unsync = true;
-		}
 	}
 	if (need_unsync)
 		kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2316,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-		    unsigned pte_access, int user_fault,
-		    int write_fault, int level,
+		    unsigned pte_access, int level,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
@@ -2378,20 +2351,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
-	if ((pte_access & ACC_WRITE_MASK)
-	    || (!vcpu->arch.mmu.direct_map && write_fault
-		&& !is_write_protection(vcpu) && !user_fault)) {
+	if (pte_access & ACC_WRITE_MASK) {
 
 		/*
-		 * There are two cases:
-		 * - the one is other vcpu creates new sp in the window
-		 *   between mapping_level() and acquiring mmu-lock.
-		 * - the another case is the new sp is created by itself
-		 *   (page-fault path) when guest uses the target gfn as
-		 *   its page table.
-		 * Both of these cases can be fixed by allowing guest to
-		 * retry the access, it will refault, then we can establish
-		 * the mapping by using small page.
+		 * Other vcpu creates new sp in the window between
+		 * mapping_level() and acquiring mmu-lock. We can
+		 * allow guest to retry the access, the mapping can
+		 * be fixed if guest refault.
 		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2365,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
-		if (!vcpu->arch.mmu.direct_map
-		    && !(pte_access & ACC_WRITE_MASK)) {
-			spte &= ~PT_USER_MASK;
-			/*
-			 * If we converted a user page to a kernel page,
-			 * so that the kernel can write to it when cr0.wp=0,
-			 * then we should prevent the kernel from executing it
-			 * if SMEP is enabled.
-			 */
-			if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				spte |= PT64_NX_MASK;
-		}
-
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
 		 * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2394,15 @@ done:
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault,
-			 int *emulate, int level, gfn_t gfn,
-			 pfn_t pfn, bool speculative,
+			 unsigned pte_access, int write_fault, int *emulate,
+			 int level, gfn_t gfn, pfn_t pfn, bool speculative,
 			 bool host_writable)
 {
 	int was_rmapped = 0;
 	int rmap_count;
 
-	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %llx\n",
-		 __func__, *sptep, pt_access,
-		 write_fault, user_fault, gfn);
+	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+		 *sptep, write_fault, gfn);
 
 	if (is_rmap_spte(*sptep)) {
 		/*
@@ -2477,9 +2426,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			was_rmapped = 1;
 	}
 
-	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-		      level, gfn, pfn, speculative, true,
-		      host_writable)) {
+	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+	      true, host_writable)) {
 		if (write_fault)
 			*emulate = 1;
 		kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2445,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		++vcpu->kvm->stat.lpages;
 
 	if (is_shadow_present_pte(*sptep)) {
-		page_header_update_slot(vcpu->kvm, sptep, gfn);
 		if (!was_rmapped) {
 			rmap_count = rmap_add(vcpu, sptep, gfn);
 			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2518,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 		return -1;
 
 	for (i = 0; i < ret; i++, gfn++, start++)
-		mmu_set_spte(vcpu, start, ACC_ALL,
-			     access, 0, 0, NULL,
-			     sp->role.level, gfn,
-			     page_to_pfn(pages[i]), true, true);
+		mmu_set_spte(vcpu, start, access, 0, NULL,
+			     sp->role.level, gfn, page_to_pfn(pages[i]),
+			     true, true);
 
 	return 0;
 }
@@ -2633,11 +2579,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
 		if (iterator.level == level) {
-			unsigned pte_access = ACC_ALL;
-
-			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-				     0, write, &emulate,
-				     level, gfn, pfn, prefault, map_writable);
+			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+				     write, &emulate, level, gfn, pfn,
+				     prefault, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
@@ -2652,11 +2596,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);
 
-			mmu_spte_set(iterator.sptep,
-				     __pa(sp->spt)
-				     | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				     | shadow_user_mask | shadow_x_mask
-				     | shadow_accessed_mask);
+			link_shadow_page(iterator.sptep, sp);
 		}
 	}
 	return emulate;
@@ -3719,6 +3659,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 	else
 		r = paging32_init_context(vcpu, context);
 
+	vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3826,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
 		*gpa &= ~(gpa_t)7;
 		*bytes = 8;
-		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
 		if (r)
 			gentry = 0;
 		new = (const u8 *)&gentry;
@@ -3987,7 +3928,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	union kvm_mmu_page_role mask = { .word = 0 };
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	int npte;
@@ -4018,7 +3958,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 		if (detect_write_misaligned(sp, gpa, bytes) ||
 		      detect_write_flooding(sp)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
@@ -4039,7 +3979,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
 			      & mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-			if (!remote_flush && need_remote_flush(entry, *spte))
+			if (need_remote_flush(entry, *spte))
 				remote_flush = true;
 			++spte;
 		}
@@ -4198,26 +4138,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
-	struct kvm_mmu_page *sp;
-	bool flush = false;
+	struct kvm_memory_slot *memslot;
+	gfn_t last_gfn;
+	int i;
 
-	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-		int i;
-		u64 *pt;
+	memslot = id_to_memslot(kvm->memslots, slot);
+	last_gfn = memslot->base_gfn + memslot->npages - 1;
 
-		if (!test_bit(slot, sp->slot_bitmap))
-			continue;
+	spin_lock(&kvm->mmu_lock);
 
-		pt = sp->spt;
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_shadow_present_pte(pt[i]) ||
-			      !is_last_spte(pt[i], sp->role.level))
-				continue;
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		unsigned long *rmapp;
+		unsigned long last_index, index;
 
-			spte_write_protect(kvm, &pt[i], &flush, false);
+		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+		for (index = 0; index <= last_index; ++index, ++rmapp) {
+			if (*rmapp)
+				__rmap_write_protect(kvm, rmapp, false);
+
+			if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+				kvm_flush_remote_tlbs(kvm);
+				cond_resched_lock(&kvm->mmu_lock);
+			}
 		}
 	}
+
 	kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)