Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c      |  15
-rw-r--r--  mm/huge_memory.c     | 108
-rw-r--r--  mm/hugetlb.c         |  10
-rw-r--r--  mm/internal.h        |   7
-rw-r--r--  mm/ksm.c             |   6
-rw-r--r--  mm/memcontrol.c      |   7
-rw-r--r--  mm/memory-failure.c  |   7
-rw-r--r--  mm/memory.c          | 198
-rw-r--r--  mm/memory_hotplug.c  |   3
-rw-r--r--  mm/mempolicy.c       | 283
-rw-r--r--  mm/migrate.c         | 337
-rw-r--r--  mm/mmap.c            |  10
-rw-r--r--  mm/mprotect.c        | 135
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/page_alloc.c      |  10
-rw-r--r--  mm/pgtable-generic.c |   9
-rw-r--r--  mm/rmap.c            |  66
-rw-r--r--  mm/vmstat.c          |  16
18 files changed, 1098 insertions(+), 131 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c index 12979121822..5ad7f4f4d6f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,  	if (blockpfn == end_pfn)  		update_pageblock_skip(cc, valid_page, total_isolated, false); +	count_vm_events(COMPACTFREE_SCANNED, nr_scanned); +	if (total_isolated) +		count_vm_events(COMPACTISOLATED, total_isolated); +  	return total_isolated;  } @@ -609,6 +613,10 @@ next_pageblock:  	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); +	count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); +	if (nr_isolated) +		count_vm_events(COMPACTISOLATED, nr_isolated); +  	return low_pfn;  } @@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)  		nr_migrate = cc->nr_migratepages;  		err = migrate_pages(&cc->migratepages, compaction_alloc,  				(unsigned long)cc, false, -				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); +				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, +				MR_COMPACTION);  		update_nr_listpages(cc);  		nr_remaining = cc->nr_migratepages; -		count_vm_event(COMPACTBLOCKS); -		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); -		if (nr_remaining) -			count_vm_events(COMPACTPAGEFAILED, nr_remaining);  		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,  						nr_remaining); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c81305..d7ee1691fd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -19,6 +19,7 @@  #include <linux/freezer.h>  #include <linux/mman.h>  #include <linux/pagemap.h> +#include <linux/migrate.h>  #include <asm/tlb.h>  #include <asm/pgalloc.h> @@ -690,7 +691,7 @@ out:  }  __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)  {  	if (likely(vma->vm_flags & VM_WRITE))  		pmd = pmd_mkwrite(pmd); @@ -848,7 +849,8 @@ out:  	 * run pte_offset_map on the pmd, if an huge pmd could  	 * materialize from under us from a different thread.  	 
*/ -	if (unlikely(__pte_alloc(mm, vma, pmd, address))) +	if (unlikely(pmd_none(*pmd)) && +	    unlikely(__pte_alloc(mm, vma, pmd, address)))  		return VM_FAULT_OOM;  	/* if an huge pmd materialized from under us just retry later */  	if (unlikely(pmd_trans_huge(*pmd))) @@ -1287,6 +1289,81 @@ out:  	return page;  } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +				unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ +	struct page *page; +	unsigned long haddr = addr & HPAGE_PMD_MASK; +	int target_nid; +	int current_nid = -1; +	bool migrated; +	bool page_locked = false; + +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(pmd, *pmdp))) +		goto out_unlock; + +	page = pmd_page(pmd); +	get_page(page); +	current_nid = page_to_nid(page); +	count_vm_numa_event(NUMA_HINT_FAULTS); +	if (current_nid == numa_node_id()) +		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + +	target_nid = mpol_misplaced(page, vma, haddr); +	if (target_nid == -1) { +		put_page(page); +		goto clear_pmdnuma; +	} + +	/* Acquire the page lock to serialise THP migrations */ +	spin_unlock(&mm->page_table_lock); +	lock_page(page); +	page_locked = true; + +	/* Confirm the PTE did not while locked */ +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(pmd, *pmdp))) { +		unlock_page(page); +		put_page(page); +		goto out_unlock; +	} +	spin_unlock(&mm->page_table_lock); + +	/* Migrate the THP to the requested node */ +	migrated = migrate_misplaced_transhuge_page(mm, vma, +				pmdp, pmd, addr, +				page, target_nid); +	if (migrated) +		current_nid = target_nid; +	else { +		spin_lock(&mm->page_table_lock); +		if (unlikely(!pmd_same(pmd, *pmdp))) { +			unlock_page(page); +			goto out_unlock; +		} +		goto clear_pmdnuma; +	} + +	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); +	return 0; + +clear_pmdnuma: +	pmd = pmd_mknonnuma(pmd); +	set_pmd_at(mm, haddr, pmdp, pmd); +	VM_BUG_ON(pmd_numa(*pmdp)); +	update_mmu_cache_pmd(vma, addr, pmdp); +	if (page_locked) +		unlock_page(page); + +out_unlock: +	spin_unlock(&mm->page_table_lock); +	if (current_nid != -1) +		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); +	return 0; +} +  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,  		 pmd_t *pmd, unsigned long addr)  { @@ -1375,7 +1452,7 @@ out:  }  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, -		unsigned long addr, pgprot_t newprot) +		unsigned long addr, pgprot_t newprot, int prot_numa)  {  	struct mm_struct *mm = vma->vm_mm;  	int ret = 0; @@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,  	if (__pmd_trans_huge_lock(pmd, vma) == 1) {  		pmd_t entry;  		entry = pmdp_get_and_clear(mm, addr, pmd); -		entry = pmd_modify(entry, newprot); +		if (!prot_numa) +			entry = pmd_modify(entry, newprot); +		else { +			struct page *page = pmd_page(*pmd); + +			/* only check non-shared pages */ +			if (page_mapcount(page) == 1 && +			    !pmd_numa(*pmd)) { +				entry = pmd_mknuma(entry); +			} +		}  		BUG_ON(pmd_write(entry));  		set_pmd_at(mm, addr, pmd, entry);  		spin_unlock(&vma->vm_mm->page_table_lock); @@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,  		 * We can't temporarily set the pmd to null in order  		 * to split it, the pmd must remain marked huge at all  		 * times or the VM won't take the pmd_trans_huge paths -		 * and it won't wait on the anon_vma->root->mutex to +		 * and it won't wait on the anon_vma->root->rwsem to  		 * serialize 
against split_huge_page*.  		 */  		pmdp_splitting_flush(vma, address, pmd); @@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)  		page_tail->mapping = page->mapping;  		page_tail->index = page->index + i; +		page_xchg_last_nid(page_tail, page_last_nid(page));  		BUG_ON(!PageAnon(page_tail));  		BUG_ON(!PageUptodate(page_tail)); @@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,  				BUG_ON(page_mapcount(page) != 1);  			if (!pmd_young(*pmd))  				entry = pte_mkold(entry); +			if (pmd_numa(*pmd)) +				entry = pte_mknuma(entry);  			pte = pte_offset_map(&_pmd, haddr);  			BUG_ON(!pte_none(*pte));  			set_pte_at(mm, haddr, pte, entry); @@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,  	return ret;  } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */  static void __split_huge_page(struct page *page,  			      struct anon_vma *anon_vma)  { @@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)  	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));  	BUG_ON(!PageAnon(page)); -	anon_vma = page_lock_anon_vma(page); +	anon_vma = page_lock_anon_vma_read(page);  	if (!anon_vma)  		goto out;  	ret = 0; @@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)  	BUG_ON(PageCompound(page));  out_unlock: -	page_unlock_anon_vma(anon_vma); +	page_unlock_anon_vma_read(anon_vma);  out:  	return ret;  } @@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,  	if (pmd_trans_huge(*pmd))  		goto out; -	anon_vma_lock(vma->anon_vma); +	anon_vma_lock_write(vma->anon_vma);  	pte = pte_offset_map(pmd, address);  	ptl = pte_lockptr(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96b..e5318c7793a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3016,7 +3016,7 @@ same_page:  	return i ? 
i : -EFAULT;  } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma,  		unsigned long address, unsigned long end, pgprot_t newprot)  {  	struct mm_struct *mm = vma->vm_mm; @@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  	pte_t *ptep;  	pte_t pte;  	struct hstate *h = hstate_vma(vma); +	unsigned long pages = 0;  	BUG_ON(address >= end);  	flush_cache_range(vma, address, end); @@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  		ptep = huge_pte_offset(mm, address);  		if (!ptep)  			continue; -		if (huge_pmd_unshare(mm, &address, ptep)) +		if (huge_pmd_unshare(mm, &address, ptep)) { +			pages++;  			continue; +		}  		if (!huge_pte_none(huge_ptep_get(ptep))) {  			pte = huge_ptep_get_and_clear(mm, address, ptep);  			pte = pte_mkhuge(pte_modify(pte, newprot));  			set_huge_pte_at(mm, address, ptep, pte); +			pages++;  		}  	}  	spin_unlock(&mm->page_table_lock); @@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  	 */  	flush_tlb_range(vma, start, end);  	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + +	return pages << h->order;  }  int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/internal.h b/mm/internal.h index 52d1fa95719..d597f94cc20 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)  {  	if (TestClearPageMlocked(page)) {  		unsigned long flags; +		int nr_pages = hpage_nr_pages(page);  		local_irq_save(flags); -		__dec_zone_page_state(page, NR_MLOCK); +		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);  		SetPageMlocked(newpage); -		__inc_zone_page_state(newpage, NR_MLOCK); +		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);  		local_irq_restore(flags);  	}  } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +  #ifdef CONFIG_TRANSPARENT_HUGEPAGE  extern unsigned long vma_address(struct page *page,  				 struct vm_area_struct *vma); @@ -1624,7 +1624,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; @@ -1678,7 +1678,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; @@ -1731,7 +1731,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8c..bbfac5063ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,  				  struct mem_cgroup **memcgp)  {  	struct mem_cgroup *memcg = NULL; +	unsigned int nr_pages = 1;  	struct page_cgroup *pc;  	enum charge_type ctype;  	*memcgp = NULL; -	VM_BUG_ON(PageTransHuge(page));  	if (mem_cgroup_disabled())  		return; +	if (PageTransHuge(page)) +		nr_pages <<= compound_order(page); +  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) { @@ -3359,7 +3362,7 @@ void 
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,  	 * charged to the res_counter since we plan on replacing the  	 * old one and only one page is going to be left afterwards.  	 */ -	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); +	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);  }  /* remove redundant charge if migration failed*/ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f..c6e4dd3e1c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,  	struct anon_vma *av;  	pgoff_t pgoff; -	av = page_lock_anon_vma(page); +	av = page_lock_anon_vma_read(page);  	if (av == NULL)	/* Not actually mapped anymore */  		return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,  		}  	}  	read_unlock(&tasklist_lock); -	page_unlock_anon_vma(av); +	page_unlock_anon_vma_read(av);  }  /* @@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)  					    page_is_file_cache(page));  		list_add(&page->lru, &pagelist);  		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, -							false, MIGRATE_SYNC); +							false, MIGRATE_SYNC, +							MR_MEMORY_FAILURE);  		if (ret) {  			putback_lru_pages(&pagelist);  			pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a0..e6a3b933517 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@  #include <linux/swapops.h>  #include <linux/elf.h>  #include <linux/gfp.h> +#include <linux/migrate.h>  #include <asm/io.h>  #include <asm/pgalloc.h> @@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,  		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);  		goto out;  	} +	if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) +		goto no_page_table;  	if (pmd_trans_huge(*pmd)) {  		if (flags & FOLL_SPLIT) {  			split_huge_page_pmd(vma, address, pmd); @@ -1532,6 +1535,8 @@ split_fallthrough:  	pte = *ptep;  	if (!pte_present(pte))  		goto no_page; +	if ((flags & FOLL_NUMA) && pte_numa(pte)) +		goto no_page;  	if ((flags & FOLL_WRITE) && !pte_write(pte))  		goto unlock; @@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,  			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);  	vm_flags &= (gup_flags & FOLL_FORCE) ?  			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + +	/* +	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault +	 * would be called on PROT_NONE ranges. We must never invoke +	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting +	 * page faults would unprotect the PROT_NONE ranges if +	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd +	 * bitflag. So to avoid that, don't set FOLL_NUMA if +	 * FOLL_FORCE is set. 
+	 */ +	if (!(gup_flags & FOLL_FORCE)) +		gup_flags |= FOLL_NUMA; +  	i = 0;  	do { @@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);  } +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +				unsigned long addr, int current_nid) +{ +	get_page(page); + +	count_vm_numa_event(NUMA_HINT_FAULTS); +	if (current_nid == numa_node_id()) +		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + +	return mpol_misplaced(page, vma, addr); +} + +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ +	struct page *page = NULL; +	spinlock_t *ptl; +	int current_nid = -1; +	int target_nid; +	bool migrated = false; + +	/* +	* The "pte" at this point cannot be used safely without +	* validation through pte_unmap_same(). It's of NUMA type but +	* the pfn may be screwed if the read is non atomic. +	* +	* ptep_modify_prot_start is not called as this is clearing +	* the _PAGE_NUMA bit and it is not really expected that there +	* would be concurrent hardware modifications to the PTE. +	*/ +	ptl = pte_lockptr(mm, pmd); +	spin_lock(ptl); +	if (unlikely(!pte_same(*ptep, pte))) { +		pte_unmap_unlock(ptep, ptl); +		goto out; +	} + +	pte = pte_mknonnuma(pte); +	set_pte_at(mm, addr, ptep, pte); +	update_mmu_cache(vma, addr, ptep); + +	page = vm_normal_page(vma, addr, pte); +	if (!page) { +		pte_unmap_unlock(ptep, ptl); +		return 0; +	} + +	current_nid = page_to_nid(page); +	target_nid = numa_migrate_prep(page, vma, addr, current_nid); +	pte_unmap_unlock(ptep, ptl); +	if (target_nid == -1) { +		/* +		 * Account for the fault against the current node if it not +		 * being replaced regardless of where the page is located. 
+		 */ +		current_nid = numa_node_id(); +		put_page(page); +		goto out; +	} + +	/* Migrate to the requested node */ +	migrated = migrate_misplaced_page(page, target_nid); +	if (migrated) +		current_nid = target_nid; + +out: +	if (current_nid != -1) +		task_numa_fault(current_nid, 1, migrated); +	return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		     unsigned long addr, pmd_t *pmdp) +{ +	pmd_t pmd; +	pte_t *pte, *orig_pte; +	unsigned long _addr = addr & PMD_MASK; +	unsigned long offset; +	spinlock_t *ptl; +	bool numa = false; +	int local_nid = numa_node_id(); + +	spin_lock(&mm->page_table_lock); +	pmd = *pmdp; +	if (pmd_numa(pmd)) { +		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); +		numa = true; +	} +	spin_unlock(&mm->page_table_lock); + +	if (!numa) +		return 0; + +	/* we're in a page fault so some vma must be in the range */ +	BUG_ON(!vma); +	BUG_ON(vma->vm_start >= _addr + PMD_SIZE); +	offset = max(_addr, vma->vm_start) & ~PMD_MASK; +	VM_BUG_ON(offset >= PMD_SIZE); +	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); +	pte += offset >> PAGE_SHIFT; +	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { +		pte_t pteval = *pte; +		struct page *page; +		int curr_nid = local_nid; +		int target_nid; +		bool migrated; +		if (!pte_present(pteval)) +			continue; +		if (!pte_numa(pteval)) +			continue; +		if (addr >= vma->vm_end) { +			vma = find_vma(mm, addr); +			/* there's a pte present so there must be a vma */ +			BUG_ON(!vma); +			BUG_ON(addr < vma->vm_start); +		} +		if (pte_numa(pteval)) { +			pteval = pte_mknonnuma(pteval); +			set_pte_at(mm, addr, pte, pteval); +		} +		page = vm_normal_page(vma, addr, pteval); +		if (unlikely(!page)) +			continue; +		/* only check non-shared pages */ +		if (unlikely(page_mapcount(page) != 1)) +			continue; + +		/* +		 * Note that the NUMA fault is later accounted to either +		 * the node that is currently running or where the page is +		 * migrated to. 
+		 */ +		curr_nid = local_nid; +		target_nid = numa_migrate_prep(page, vma, addr, +					       page_to_nid(page)); +		if (target_nid == -1) { +			put_page(page); +			continue; +		} + +		/* Migrate to the requested node */ +		pte_unmap_unlock(pte, ptl); +		migrated = migrate_misplaced_page(page, target_nid); +		if (migrated) +			curr_nid = target_nid; +		task_numa_fault(curr_nid, 1, migrated); + +		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); +	} +	pte_unmap_unlock(orig_pte, ptl); + +	return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		     unsigned long addr, pmd_t *pmdp) +{ +	BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ +  /*   * These routines also need to handle stuff like marking pages dirty   * and/or accessed for architectures that don't do it in hardware (most @@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,  					pte, pmd, flags, entry);  	} +	if (pte_numa(entry)) +		return do_numa_page(mm, vma, address, entry, pte, pmd); +  	ptl = pte_lockptr(mm, pmd);  	spin_lock(ptl);  	if (unlikely(!pte_same(*pte, entry))) @@ -3520,8 +3704,11 @@ retry:  		if (pmd_trans_huge(orig_pmd)) {  			unsigned int dirty = flags & FAULT_FLAG_WRITE; -			if (dirty && !pmd_write(orig_pmd) && -			    !pmd_trans_splitting(orig_pmd)) { +			if (pmd_numa(orig_pmd)) +				return do_huge_pmd_numa_page(mm, vma, address, +							     orig_pmd, pmd); + +			if (dirty && !pmd_write(orig_pmd)) {  				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,  							  orig_pmd);  				/* @@ -3536,16 +3723,21 @@ retry:  				huge_pmd_set_accessed(mm, vma, address, pmd,  						      orig_pmd, dirty);  			} +  			return 0;  		}  	} +	if (pmd_numa(*pmd)) +		return do_pmd_numa_page(mm, vma, address, pmd); +  	/*  	 * Use __pte_alloc instead of pte_alloc_map, because we can't  	 * run pte_offset_map on the pmd, if an huge pmd could  	 * materialize from under us from a different thread.  	 */ -	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) +	if (unlikely(pmd_none(*pmd)) && +	    unlikely(__pte_alloc(mm, vma, pmd, address)))  		return VM_FAULT_OOM;  	/* if an huge pmd materialized from under us just retry later */  	if (unlikely(pmd_trans_huge(*pmd))) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e8..962e353aa86 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)  		 * migrate_pages returns # of failed pages.  		 
*/  		ret = migrate_pages(&source, alloc_migrate_target, 0, -							true, MIGRATE_SYNC); +							true, MIGRATE_SYNC, +							MR_MEMORY_HOTPLUG);  		if (ret)  			putback_lru_pages(&source);  	} diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6..d1b315e9862 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@  #include <linux/syscalls.h>  #include <linux/ctype.h>  #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h>  #include <asm/tlbflush.h>  #include <asm/uaccess.h> @@ -117,6 +118,26 @@ static struct mempolicy default_policy = {  	.flags = MPOL_F_LOCAL,  }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ +	struct mempolicy *pol = p->mempolicy; +	int node; + +	if (!pol) { +		node = numa_node_id(); +		if (node != -1) +			pol = &preferred_node_policy[node]; + +		/* preferred_node_policy is not initialised early in boot */ +		if (!pol->mode) +			pol = NULL; +	} + +	return pol; +} +  static const struct mempolicy_operations {  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);  	/* @@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  	if (mode == MPOL_DEFAULT) {  		if (nodes && !nodes_empty(*nodes))  			return ERR_PTR(-EINVAL); -		return NULL;	/* simply delete any existing policy */ +		return NULL;  	}  	VM_BUG_ON(!nodes); @@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  			     (flags & MPOL_F_RELATIVE_NODES)))  				return ERR_PTR(-EINVAL);  		} +	} else if (mode == MPOL_LOCAL) { +		if (!nodes_empty(*nodes)) +			return ERR_PTR(-EINVAL); +		mode = MPOL_PREFERRED;  	} else if (nodes_empty(*nodes))  		return ERR_PTR(-EINVAL);  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,  	return 0;  } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. + */ +unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end) +{ +	int nr_updated; +	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); + +	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); +	if (nr_updated) +		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + +	return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end) +{ +	return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +  /*   * Check if all pages in a range are on a set of nodes.   
* If pagelist != NULL then isolate pages from the LRU and @@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,  		return ERR_PTR(-EFAULT);  	prev = NULL;  	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { +		unsigned long endvma = vma->vm_end; + +		if (endvma > end) +			endvma = end; +		if (vma->vm_start > start) +			start = vma->vm_start; +  		if (!(flags & MPOL_MF_DISCONTIG_OK)) {  			if (!vma->vm_next && vma->vm_end < end)  				return ERR_PTR(-EFAULT);  			if (prev && prev->vm_end < vma->vm_start)  				return ERR_PTR(-EFAULT);  		} -		if (!is_vm_hugetlb_page(vma) && -		    ((flags & MPOL_MF_STRICT) || + +		if (is_vm_hugetlb_page(vma)) +			goto next; + +		if (flags & MPOL_MF_LAZY) { +			change_prot_numa(vma, start, endvma); +			goto next; +		} + +		if ((flags & MPOL_MF_STRICT) ||  		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && -				vma_migratable(vma)))) { -			unsigned long endvma = vma->vm_end; +		      vma_migratable(vma))) { -			if (endvma > end) -				endvma = end; -			if (vma->vm_start > start) -				start = vma->vm_start;  			err = check_pgd_range(vma, start, endvma, nodes,  						flags, private);  			if (err) { @@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,  				break;  			}  		} +next:  		prev = vma;  	}  	return first; @@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,  	if (!list_empty(&pagelist)) {  		err = migrate_pages(&pagelist, new_node_page, dest, -							false, MIGRATE_SYNC); +							false, MIGRATE_SYNC, +							MR_SYSCALL);  		if (err)  			putback_lru_pages(&pagelist);  	} @@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,  	int err;  	LIST_HEAD(pagelist); -	if (flags & ~(unsigned long)(MPOL_MF_STRICT | -				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) +	if (flags & ~(unsigned long)MPOL_MF_VALID)  		return -EINVAL;  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))  		return -EPERM; @@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,  	if (IS_ERR(new))  		return PTR_ERR(new); +	if (flags & MPOL_MF_LAZY) +		new->flags |= MPOL_F_MOF; +  	/*  	 * If we are using the default policy then operation  	 * on discontinuous address spaces is okay after all @@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,  	vma = check_range(mm, start, end, nmask,  			  flags | MPOL_MF_INVERT, &pagelist); -	err = PTR_ERR(vma); -	if (!IS_ERR(vma)) { -		int nr_failed = 0; - +	err = PTR_ERR(vma);	/* maybe ... 
*/ +	if (!IS_ERR(vma))  		err = mbind_range(mm, start, end, new); +	if (!err) { +		int nr_failed = 0; +  		if (!list_empty(&pagelist)) { +			WARN_ON_ONCE(flags & MPOL_MF_LAZY);  			nr_failed = migrate_pages(&pagelist, new_vma_page,  						(unsigned long)vma, -						false, MIGRATE_SYNC); +						false, MIGRATE_SYNC, +						MR_MEMPOLICY_MBIND);  			if (nr_failed)  				putback_lru_pages(&pagelist);  		} -		if (!err && nr_failed && (flags & MPOL_MF_STRICT)) +		if (nr_failed && (flags & MPOL_MF_STRICT))  			err = -EIO;  	} else  		putback_lru_pages(&pagelist); @@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,  struct mempolicy *get_vma_policy(struct task_struct *task,  		struct vm_area_struct *vma, unsigned long addr)  { -	struct mempolicy *pol = task->mempolicy; +	struct mempolicy *pol = get_task_policy(task);  	if (vma) {  		if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1956,7 +2028,7 @@ retry_cpuset:   */  struct page *alloc_pages_current(gfp_t gfp, unsigned order)  { -	struct mempolicy *pol = current->mempolicy; +	struct mempolicy *pol = get_task_policy(current);  	struct page *page;  	unsigned int cpuset_mems_cookie; @@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)  	kmem_cache_free(sn_cache, n);  } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page   - page to be checked + * @vma    - vm area where page mapped + * @addr   - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + *	-1	- not misplaced, page is in the right node + *	node	- node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ +	struct mempolicy *pol; +	struct zone *zone; +	int curnid = page_to_nid(page); +	unsigned long pgoff; +	int polnid = -1; +	int ret = -1; + +	BUG_ON(!vma); + +	pol = get_vma_policy(current, vma, addr); +	if (!(pol->flags & MPOL_F_MOF)) +		goto out; + +	switch (pol->mode) { +	case MPOL_INTERLEAVE: +		BUG_ON(addr >= vma->vm_end); +		BUG_ON(addr < vma->vm_start); + +		pgoff = vma->vm_pgoff; +		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; +		polnid = offset_il_node(pol, vma, pgoff); +		break; + +	case MPOL_PREFERRED: +		if (pol->flags & MPOL_F_LOCAL) +			polnid = numa_node_id(); +		else +			polnid = pol->v.preferred_node; +		break; + +	case MPOL_BIND: +		/* +		 * allows binding to multiple nodes. +		 * use current page if in policy nodemask, +		 * else select nearest allowed node, if any. +		 * If no allowed nodes, use current [!misplaced]. +		 */ +		if (node_isset(curnid, pol->v.nodes)) +			goto out; +		(void)first_zones_zonelist( +				node_zonelist(numa_node_id(), GFP_HIGHUSER), +				gfp_zone(GFP_HIGHUSER), +				&pol->v.nodes, &zone); +		polnid = zone->node; +		break; + +	default: +		BUG(); +	} + +	/* Migrate the page towards the node whose CPU is referencing it */ +	if (pol->flags & MPOL_F_MORON) { +		int last_nid; + +		polnid = numa_node_id(); + +		/* +		 * Multi-stage node selection is used in conjunction +		 * with a periodic migration fault to build a temporal +		 * task<->page relation. By using a two-stage filter we +		 * remove short/unlikely relations. 
+		 * +		 * Using P(p) ~ n_p / n_t as per frequentist +		 * probability, we can equate a task's usage of a +		 * particular page (n_p) per total usage of this +		 * page (n_t) (in a given time-span) to a probability. +		 * +		 * Our periodic faults will sample this probability and +		 * getting the same result twice in a row, given these +		 * samples are fully independent, is then given by +		 * P(n)^2, provided our sample period is sufficiently +		 * short compared to the usage pattern. +		 * +		 * This quadric squishes small probabilities, making +		 * it less likely we act on an unlikely task<->page +		 * relation. +		 */ +		last_nid = page_xchg_last_nid(page, polnid); +		if (last_nid != polnid) +			goto out; +	} + +	if (curnid != polnid) +		ret = polnid; +out: +	mpol_cond_put(pol); + +	return ret; +} +  static void sp_delete(struct shared_policy *sp, struct sp_node *n)  {  	pr_debug("deleting %lx-l%lx\n", n->start, n->end); @@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)  	mutex_unlock(&p->mutex);  } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ +	bool numabalancing_default = false; + +	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) +		numabalancing_default = true; + +	if (nr_node_ids > 1 && !numabalancing_override) { +		printk(KERN_INFO "Enabling automatic NUMA balancing. " +			"Configure with numa_balancing= or sysctl"); +		set_numabalancing_state(numabalancing_default); +	} +} + +static int __init setup_numabalancing(char *str) +{ +	int ret = 0; +	if (!str) +		goto out; +	numabalancing_override = true; + +	if (!strcmp(str, "enable")) { +		set_numabalancing_state(true); +		ret = 1; +	} else if (!strcmp(str, "disable")) { +		set_numabalancing_state(false); +		ret = 1; +	} +out: +	if (!ret) +		printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + +	return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ +  /* assumes fs == KERNEL_DS */  void __init numa_policy_init(void)  { @@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)  				     sizeof(struct sp_node),  				     0, SLAB_PANIC, NULL); +	for_each_node(nid) { +		preferred_node_policy[nid] = (struct mempolicy) { +			.refcnt = ATOMIC_INIT(1), +			.mode = MPOL_PREFERRED, +			.flags = MPOL_F_MOF | MPOL_F_MORON, +			.v = { .preferred_node = nid, }, +		}; +	} +  	/*  	 * Set interleaving policy for system init. 
Interleaving is only  	 * enabled across suitably sized nodes (default is >= 16MB), or @@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))  		printk("numa_policy_init: interleaving failed\n"); + +	check_numabalancing_enable();  }  /* Reset policy of current process to default */ @@ -2362,14 +2598,13 @@ void numa_default_policy(void)   * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag   * Used only for mpol_parse_str() and mpol_to_str()   */ -#define MPOL_LOCAL MPOL_MAX  static const char * const policy_modes[] =  {  	[MPOL_DEFAULT]    = "default",  	[MPOL_PREFERRED]  = "prefer",  	[MPOL_BIND]       = "bind",  	[MPOL_INTERLEAVE] = "interleave", -	[MPOL_LOCAL]      = "local" +	[MPOL_LOCAL]      = "local",  }; @@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  	if (flags)  		*flags++ = '\0';	/* terminate mode string */ -	for (mode = 0; mode <= MPOL_LOCAL; mode++) { +	for (mode = 0; mode < MPOL_MAX; mode++) {  		if (!strcmp(str, policy_modes[mode])) {  			break;  		}  	} -	if (mode > MPOL_LOCAL) +	if (mode >= MPOL_MAX)  		goto out;  	switch (mode) { diff --git a/mm/migrate.c b/mm/migrate.c index cae02711181..32efd8028bc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,6 +39,9 @@  #include <asm/tlbflush.h> +#define CREATE_TRACE_POINTS +#include <trace/events/migrate.h> +  #include "internal.h"  /* @@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,  		struct page *newpage, struct page *page,  		struct buffer_head *head, enum migrate_mode mode)  { -	int expected_count; +	int expected_count = 0;  	void **pslot;  	if (!mapping) { @@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,   */  void migrate_page_copy(struct page *newpage, struct page *page)  { -	if (PageHuge(page)) +	if (PageHuge(page) || PageTransHuge(page))  		copy_huge_page(newpage, page);  	else  		copy_highpage(newpage, page); @@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,  	 */  	if (PageAnon(page)) {  		/* -		 * Only page_lock_anon_vma() understands the subtleties of +		 * Only page_lock_anon_vma_read() understands the subtleties of  		 * getting a hold on an anon_vma from outside one of its mms.  		 
*/  		anon_vma = page_get_anon_vma(page); @@ -998,10 +1001,11 @@ out:   */  int migrate_pages(struct list_head *from,  		new_page_t get_new_page, unsigned long private, bool offlining, -		enum migrate_mode mode) +		enum migrate_mode mode, int reason)  {  	int retry = 1;  	int nr_failed = 0; +	int nr_succeeded = 0;  	int pass = 0;  	struct page *page;  	struct page *page2; @@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,  				retry++;  				break;  			case MIGRATEPAGE_SUCCESS: +				nr_succeeded++;  				break;  			default:  				/* Permanent failure */ @@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from,  	}  	rc = nr_failed + retry;  out: +	if (nr_succeeded) +		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); +	if (nr_failed) +		count_vm_events(PGMIGRATE_FAIL, nr_failed); +	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); +  	if (!swapwrite)  		current->flags &= ~PF_SWAPWRITE; @@ -1176,7 +1187,8 @@ set_status:  	err = 0;  	if (!list_empty(&pagelist)) {  		err = migrate_pages(&pagelist, new_page_node, -				(unsigned long)pm, 0, MIGRATE_SYNC); +				(unsigned long)pm, 0, MIGRATE_SYNC, +				MR_SYSCALL);  		if (err)  			putback_lru_pages(&pagelist);  	} @@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,   	}   	return err;  } -#endif + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns true if this is a safe migration target node for misplaced NUMA + * pages. Currently it only checks the watermarks which crude + */ +static bool migrate_balanced_pgdat(struct pglist_data *pgdat, +				   int nr_migrate_pages) +{ +	int z; +	for (z = pgdat->nr_zones - 1; z >= 0; z--) { +		struct zone *zone = pgdat->node_zones + z; + +		if (!populated_zone(zone)) +			continue; + +		if (zone->all_unreclaimable) +			continue; + +		/* Avoid waking kswapd by allocating pages_to_migrate pages. */ +		if (!zone_watermark_ok(zone, 0, +				       high_wmark_pages(zone) + +				       nr_migrate_pages, +				       0, 0)) +			continue; +		return true; +	} +	return false; +} + +static struct page *alloc_misplaced_dst_page(struct page *page, +					   unsigned long data, +					   int **result) +{ +	int nid = (int) data; +	struct page *newpage; + +	newpage = alloc_pages_exact_node(nid, +					 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | +					  __GFP_NOMEMALLOC | __GFP_NORETRY | +					  __GFP_NOWARN) & +					 ~GFP_IOFS, 0); +	if (newpage) +		page_xchg_last_nid(newpage, page_last_nid(page)); + +	return newpage; +} + +/* + * page migration rate limiting control. + * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs + * window of time. Default here says do not migrate more than 1280M per second. + * If a node is rate-limited then PTE NUMA updates are also rate-limited. However + * as it is faults that reset the window, pte updates will happen unconditionally + * if there has not been a fault since @pteupdate_interval_millisecs after the + * throttle window closed. 
+ */ +static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; +static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); + +/* Returns true if NUMA migration is currently rate limited */ +bool migrate_ratelimited(int node) +{ +	pg_data_t *pgdat = NODE_DATA(node); + +	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + +				msecs_to_jiffies(pteupdate_interval_millisecs))) +		return false; + +	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) +		return false; + +	return true; +} + +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +{ +	bool rate_limited = false; + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! +	 */ +	spin_lock(&pgdat->numabalancing_migrate_lock); +	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { +		pgdat->numabalancing_migrate_nr_pages = 0; +		pgdat->numabalancing_migrate_next_window = jiffies + +			msecs_to_jiffies(migrate_interval_millisecs); +	} +	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) +		rate_limited = true; +	else +		pgdat->numabalancing_migrate_nr_pages += nr_pages; +	spin_unlock(&pgdat->numabalancing_migrate_lock); +	 +	return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ +	int ret = 0; + +	/* Avoid migrating to a node that is nearly full */ +	if (migrate_balanced_pgdat(pgdat, 1)) { +		int page_lru; + +		if (isolate_lru_page(page)) { +			put_page(page); +			return 0; +		} + +		/* Page is isolated */ +		ret = 1; +		page_lru = page_is_file_cache(page); +		if (!PageTransHuge(page)) +			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); +		else +			mod_zone_page_state(page_zone(page), +					NR_ISOLATED_ANON + page_lru, +					HPAGE_PMD_NR); +	} + +	/* +	 * Page is either isolated or there is not enough space on the target +	 * node. If isolated, then it has taken a reference count and the +	 * callers reference can be safely dropped without the page +	 * disappearing underneath us during migration. Otherwise the page is +	 * not to be migrated but the callers reference should still be +	 * dropped so it does not leak. +	 */ +	put_page(page); + +	return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. + */ +int migrate_misplaced_page(struct page *page, int node) +{ +	pg_data_t *pgdat = NODE_DATA(node); +	int isolated = 0; +	int nr_remaining; +	LIST_HEAD(migratepages); + +	/* +	 * Don't migrate pages that are mapped in multiple processes. +	 * TODO: Handle false sharing detection instead of this hammer +	 */ +	if (page_mapcount(page) != 1) { +		put_page(page); +		goto out; +	} + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! 
+	 */ +	if (numamigrate_update_ratelimit(pgdat, 1)) { +		put_page(page); +		goto out; +	} + +	isolated = numamigrate_isolate_page(pgdat, page); +	if (!isolated) +		goto out; + +	list_add(&page->lru, &migratepages); +	nr_remaining = migrate_pages(&migratepages, +			alloc_misplaced_dst_page, +			node, false, MIGRATE_ASYNC, +			MR_NUMA_MISPLACED); +	if (nr_remaining) { +		putback_lru_pages(&migratepages); +		isolated = 0; +	} else +		count_vm_numa_event(NUMA_PAGE_MIGRATE); +	BUG_ON(!list_empty(&migratepages)); +out: +	return isolated; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +int migrate_misplaced_transhuge_page(struct mm_struct *mm, +				struct vm_area_struct *vma, +				pmd_t *pmd, pmd_t entry, +				unsigned long address, +				struct page *page, int node) +{ +	unsigned long haddr = address & HPAGE_PMD_MASK; +	pg_data_t *pgdat = NODE_DATA(node); +	int isolated = 0; +	struct page *new_page = NULL; +	struct mem_cgroup *memcg = NULL; +	int page_lru = page_is_file_cache(page); + +	/* +	 * Don't migrate pages that are mapped in multiple processes. +	 * TODO: Handle false sharing detection instead of this hammer +	 */ +	if (page_mapcount(page) != 1) +		goto out_dropref; + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! +	 */ +	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) +		goto out_dropref; + +	new_page = alloc_pages_node(node, +		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); +	if (!new_page) { +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		goto out_dropref; +	} +	page_xchg_last_nid(new_page, page_last_nid(page)); + +	isolated = numamigrate_isolate_page(pgdat, page); +	if (!isolated) { +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		put_page(new_page); +		goto out_keep_locked; +	} + +	/* Prepare a page as a migration target */ +	__set_page_locked(new_page); +	SetPageSwapBacked(new_page); + +	/* anon mapping, we can simply copy page->mapping to the new page: */ +	new_page->mapping = page->mapping; +	new_page->index = page->index; +	migrate_page_copy(new_page, page); +	WARN_ON(PageLRU(new_page)); + +	/* Recheck the target PMD */ +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(*pmd, entry))) { +		spin_unlock(&mm->page_table_lock); + +		/* Reverse changes made by migrate_page_copy() */ +		if (TestClearPageActive(new_page)) +			SetPageActive(page); +		if (TestClearPageUnevictable(new_page)) +			SetPageUnevictable(page); +		mlock_migrate_page(page, new_page); + +		unlock_page(new_page); +		put_page(new_page);		/* Free it */ + +		unlock_page(page); +		putback_lru_page(page); + +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		goto out; +	} + +	/* +	 * Traditional migration needs to prepare the memcg charge +	 * transaction early to prevent the old page from being +	 * uncharged when installing migration entries.  Here we can +	 * save the potential rollback and start the charge transfer +	 * only when migration is already known to end successfully. 
+	 */ +	mem_cgroup_prepare_migration(page, new_page, &memcg); + +	entry = mk_pmd(new_page, vma->vm_page_prot); +	entry = pmd_mknonnuma(entry); +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); +	entry = pmd_mkhuge(entry); + +	page_add_new_anon_rmap(new_page, vma, haddr); + +	set_pmd_at(mm, haddr, pmd, entry); +	update_mmu_cache_pmd(vma, address, entry); +	page_remove_rmap(page); +	/* +	 * Finish the charge transaction under the page table lock to +	 * prevent split_huge_page() from dividing up the charge +	 * before it's fully transferred to the new page. +	 */ +	mem_cgroup_end_migration(memcg, page, new_page, true); +	spin_unlock(&mm->page_table_lock); + +	unlock_page(new_page); +	unlock_page(page); +	put_page(page);			/* Drop the rmap reference */ +	put_page(page);			/* Drop the LRU isolation reference */ + +	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); +	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: +	mod_zone_page_state(page_zone(page), +			NR_ISOLATED_ANON + page_lru, +			-HPAGE_PMD_NR); +	return isolated; + +out_dropref: +	put_page(page); +out_keep_locked: +	return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ diff --git a/mm/mmap.c b/mm/mmap.c index 2b7d9e78a56..f54b235f29a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -736,7 +736,7 @@ again:			remove_next = 1 + (end > next->vm_end);  	if (anon_vma) {  		VM_BUG_ON(adjust_next && next->anon_vma &&  			  anon_vma != next->anon_vma); -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_pre_update_vma(vma);  		if (adjust_next)  			anon_vma_interval_tree_pre_update_vma(next); @@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)  		 * The LSB of head.next can't change from under us  		 * because we hold the mm_all_locks_mutex.  		 */ -		mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); +		down_write(&anon_vma->root->rwsem);  		/*  		 * We can safely modify head.next after taking the -		 * anon_vma->root->mutex. If some other vma in this mm shares +		 * anon_vma->root->rwsem. If some other vma in this mm shares  		 * the same anon_vma we won't take it again.  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us thanks to the -		 * anon_vma->root->mutex. +		 * anon_vma->root->rwsem.  		 */  		if (__test_and_set_bit(0, (unsigned long *)  				       &anon_vma->root->rb_root.rb_node)) @@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us until we release the -		 * anon_vma->root->mutex. +		 * anon_vma->root->rwsem.  		 
*/  		if (!__test_and_clear_bit(0, (unsigned long *)  					  &anon_vma->root->rb_root.rb_node)) diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6f..3dca970367d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)  }  #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa, bool *ret_all_same_node)  { +	struct mm_struct *mm = vma->vm_mm;  	pte_t *pte, oldpte;  	spinlock_t *ptl; +	unsigned long pages = 0; +	bool all_same_node = true; +	int last_nid = -1;  	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);  	arch_enter_lazy_mmu_mode(); @@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,  		oldpte = *pte;  		if (pte_present(oldpte)) {  			pte_t ptent; +			bool updated = false;  			ptent = ptep_modify_prot_start(mm, addr, pte); -			ptent = pte_modify(ptent, newprot); +			if (!prot_numa) { +				ptent = pte_modify(ptent, newprot); +				updated = true; +			} else { +				struct page *page; + +				page = vm_normal_page(vma, addr, oldpte); +				if (page) { +					int this_nid = page_to_nid(page); +					if (last_nid == -1) +						last_nid = this_nid; +					if (last_nid != this_nid) +						all_same_node = false; + +					/* only check non-shared pages */ +					if (!pte_numa(oldpte) && +					    page_mapcount(page) == 1) { +						ptent = pte_mknuma(ptent); +						updated = true; +					} +				} +			}  			/*  			 * Avoid taking write faults for pages we know to be  			 * dirty.  			 */ -			if (dirty_accountable && pte_dirty(ptent)) +			if (dirty_accountable && pte_dirty(ptent)) {  				ptent = pte_mkwrite(ptent); +				updated = true; +			} +			if (updated) +				pages++;  			ptep_modify_prot_commit(mm, addr, pte, ptent);  		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {  			swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,  				set_pte_at(mm, addr, pte,  					swp_entry_to_pte(entry));  			} +			pages++;  		}  	} while (pte++, addr += PAGE_SIZE, addr != end);  	arch_leave_lazy_mmu_mode();  	pte_unmap_unlock(pte - 1, ptl); + +	*ret_all_same_node = all_same_node; +	return pages; +} + +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, +		pmd_t *pmd) +{ +	spin_lock(&mm->page_table_lock); +	set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); +	spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, +		pmd_t *pmd) +{ +	BUG();  } +#endif /* CONFIG_NUMA_BALANCING */ -static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	pmd_t *pmd;  	unsigned long next; +	unsigned long pages = 0; +	bool all_same_node;  	pmd = pmd_offset(pud, addr);  	do { @@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,  		if (pmd_trans_huge(*pmd)) {  			if (next - addr != HPAGE_PMD_SIZE)  				split_huge_page_pmd(vma, addr, pmd); -			else if (change_huge_pmd(vma, pmd, addr, newprot)) +			else if 
(change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { +				pages += HPAGE_PMD_NR;  				continue; +			}  			/* fall through */  		}  		if (pmd_none_or_clear_bad(pmd))  			continue; -		change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -				 dirty_accountable); +		pages += change_pte_range(vma, pmd, addr, next, newprot, +				 dirty_accountable, prot_numa, &all_same_node); + +		/* +		 * If we are changing protections for NUMA hinting faults then +		 * set pmd_numa if the examined pages were all on the same +		 * node. This allows a regular PMD to be handled as one fault +		 * and effectively batches the taking of the PTL +		 */ +		if (prot_numa && all_same_node) +			change_pmd_protnuma(vma->vm_mm, addr, pmd);  	} while (pmd++, addr = next, addr != end); + +	return pages;  } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	pud_t *pud;  	unsigned long next; +	unsigned long pages = 0;  	pud = pud_offset(pgd, addr);  	do {  		next = pud_addr_end(addr, end);  		if (pud_none_or_clear_bad(pud))  			continue; -		change_pmd_range(vma, pud, addr, next, newprot, -				 dirty_accountable); +		pages += change_pmd_range(vma, pud, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pud++, addr = next, addr != end); + +	return pages;  } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	struct mm_struct *mm = vma->vm_mm;  	pgd_t *pgd;  	unsigned long next;  	unsigned long start = addr; +	unsigned long pages = 0;  	BUG_ON(addr >= end);  	pgd = pgd_offset(mm, addr); @@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,  		next = pgd_addr_end(addr, end);  		if (pgd_none_or_clear_bad(pgd))  			continue; -		change_pud_range(vma, pgd, addr, next, newprot, -				 dirty_accountable); +		pages += change_pud_range(vma, pgd, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pgd++, addr = next, addr != end); -	flush_tlb_range(vma, start, end); + +	/* Only flush the TLB if we actually modified any entries: */ +	if (pages) +		flush_tlb_range(vma, start, end); + +	return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +		       unsigned long end, pgprot_t newprot, +		       int dirty_accountable, int prot_numa) +{ +	struct mm_struct *mm = vma->vm_mm; +	unsigned long pages; + +	mmu_notifier_invalidate_range_start(mm, start, end); +	if (is_vm_hugetlb_page(vma)) +		pages = hugetlb_change_protection(vma, start, end, newprot); +	else +		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); +	mmu_notifier_invalidate_range_end(mm, start, end); + +	return pages;  }  int @@ -213,12 +304,8 @@ success:  		dirty_accountable = 1;  	} -	mmu_notifier_invalidate_range_start(mm, start, end); -	if (is_vm_hugetlb_page(vma)) -		hugetlb_change_protection(vma, start, end, vma->vm_page_prot); -	else -		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); -	mmu_notifier_invalidate_range_end(mm, start, end); +	change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); +  	vm_stat_account(mm, oldflags, 
vma->vm_file, -nrpages);  	vm_stat_account(mm, newflags, vma->vm_file, nrpages);  	perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9..e1031e1f6a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  		}  		if (vma->anon_vma) {  			anon_vma = vma->anon_vma; -			anon_vma_lock(anon_vma); +			anon_vma_lock_write(anon_vma);  		}  	} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba11..d037c8bc151 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page)  		bad_page(page);  		return 1;  	} +	reset_page_last_nid(page);  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;  	return 0; @@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		mminit_verify_page_links(page, zone, nid, pfn);  		init_page_count(page);  		reset_page_mapcount(page); +		reset_page_last_nid(page);  		SetPageReserved(page);  		/*  		 * Mark the block movable so that blocks are reserved for @@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,  	int ret;  	pgdat_resize_init(pgdat); +#ifdef CONFIG_NUMA_BALANCING +	spin_lock_init(&pgdat->numabalancing_migrate_lock); +	pgdat->numabalancing_migrate_nr_pages = 0; +	pgdat->numabalancing_migrate_next_window = jiffies; +#endif  	init_waitqueue_head(&pgdat->kswapd_wait);  	init_waitqueue_head(&pgdat->pfmemalloc_wait);  	pgdat_page_cgroup_init(pgdat); @@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,  		ret = migrate_pages(&cc->migratepages,  				    alloc_migrate_target, -				    0, false, MIGRATE_SYNC); +				    0, false, MIGRATE_SYNC, +				    MR_CMA);  	}  	putback_movable_pages(&cc->migratepages); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b..0c8323fe6c8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -12,8 +12,8 @@  #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS  /* - * Only sets the access flags (dirty, accessed, and - * writable). Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed), as well as write  + * permission. Furthermore, we know it always gets set to a "more   * permissive" setting, which allows most architectures to optimize   * this. We return whether the PTE actually changed, which in turn   * instructs the caller to do things like update__mmu_cache.  
This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,  	int changed = !pte_same(*ptep, entry);  	if (changed) {  		set_pte_at(vma->vm_mm, address, ptep, entry); -		flush_tlb_page(vma, address); +		flush_tlb_fix_spurious_fault(vma, address);  	}  	return changed;  } @@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,  {  	pte_t pte;  	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); -	flush_tlb_page(vma, address); +	if (pte_accessible(pte)) +		flush_tlb_page(vma, address);  	return pte;  }  #endif diff --git a/mm/rmap.c b/mm/rmap.c index face808a489..2c78f8cadc9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@   *   mm->mmap_sem   *     page->flags PG_locked (lock_page)   *       mapping->i_mmap_mutex - *         anon_vma->mutex + *         anon_vma->rwsem   *           mm->page_table_lock or pte_lock   *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)   *             swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@   *                           in arch-dependent flush_dcache_mmap_lock,   *                           within bdi.wb->list_lock in __sync_single_inode)   * - * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)   *   ->tasklist_lock   *     pte map lock   */ @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)  	VM_BUG_ON(atomic_read(&anon_vma->refcount));  	/* -	 * Synchronize against page_lock_anon_vma() such that +	 * Synchronize against page_lock_anon_vma_read() such that  	 * we can safely hold the lock without the anon_vma getting  	 * freed.  	 *  	 * Relies on the full mb implied by the atomic_dec_and_test() from  	 * put_anon_vma() against the acquire barrier implied by -	 * mutex_trylock() from page_lock_anon_vma(). This orders: +	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:  	 * -	 * page_lock_anon_vma()		VS	put_anon_vma() -	 *   mutex_trylock()			  atomic_dec_and_test() +	 * page_lock_anon_vma_read()	VS	put_anon_vma() +	 *   down_read_trylock()		  atomic_dec_and_test()  	 *   LOCK				  MB -	 *   atomic_read()			  mutex_is_locked() +	 *   atomic_read()			  rwsem_is_locked()  	 *  	 * LOCK should suffice since the actual taking of the lock must  	 * happen _before_ what follows.  	 */ -	if (mutex_is_locked(&anon_vma->root->mutex)) { -		anon_vma_lock(anon_vma); +	if (rwsem_is_locked(&anon_vma->root->rwsem)) { +		anon_vma_lock_write(anon_vma);  		anon_vma_unlock(anon_vma);  	} @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,   * allocate a new one.   *   * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read()   * and that may actually touch the spinlock even in the newly   * allocated vma (it depends on RCU to make sure that the   * anon_vma isn't actually destroyed). 
diff --git a/mm/rmap.c b/mm/rmap.c
index face808a489..2c78f8cadc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
- *         anon_vma->mutex
+ *         anon_vma->rwsem
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *             swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within bdi.wb->list_lock in __sync_single_inode)
  *
- * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
  *     pte map lock
  */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
 
 	/*
-	 * Synchronize against page_lock_anon_vma() such that
+	 * Synchronize against page_lock_anon_vma_read() such that
 	 * we can safely hold the lock without the anon_vma getting
 	 * freed.
 	 *
 	 * Relies on the full mb implied by the atomic_dec_and_test() from
 	 * put_anon_vma() against the acquire barrier implied by
-	 * mutex_trylock() from page_lock_anon_vma(). This orders:
+	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
 	 *
-	 * page_lock_anon_vma()		VS	put_anon_vma()
-	 *   mutex_trylock()			  atomic_dec_and_test()
+	 * page_lock_anon_vma_read()	VS	put_anon_vma()
+	 *   down_read_trylock()		  atomic_dec_and_test()
 	 *   LOCK				  MB
-	 *   atomic_read()			  mutex_is_locked()
+	 *   atomic_read()			  rwsem_is_locked()
 	 *
 	 * LOCK should suffice since the actual taking of the lock must
 	 * happen _before_ what follows.
 	 */
-	if (mutex_is_locked(&anon_vma->root->mutex)) {
-		anon_vma_lock(anon_vma);
+	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+		anon_vma_lock_write(anon_vma);
 		anon_vma_unlock(anon_vma);
 	}
 
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
  * allocate a new one.
  *
  * Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
  * and that may actually touch the spinlock even in the newly
  * allocated vma (it depends on RCU to make sure that the
  * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			allocated = anon_vma;
 		}
 
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
 	struct anon_vma *new_root = anon_vma->root;
 	if (new_root != root) {
 		if (WARN_ON_ONCE(root))
-			mutex_unlock(&root->mutex);
+			up_write(&root->rwsem);
 		root = new_root;
-		mutex_lock(&root->mutex);
+		down_write(&root->rwsem);
 	}
 	return root;
 }
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
 static inline void unlock_anon_vma_root(struct anon_vma *root)
 {
 	if (root)
-		mutex_unlock(&root->mutex);
+		up_write(&root->rwsem);
 }
 
 /*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
 	anon_vma_unlock(anon_vma);
 
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 	/*
 	 * Iterate the list once more, it now only contains empty and unlinked
 	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
-	 * needing to acquire the anon_vma->root->mutex.
+	 * needing to write-acquire the anon_vma->root->rwsem.
 	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
 {
 	struct anon_vma *anon_vma = data;
 
-	mutex_init(&anon_vma->mutex);
+	init_rwsem(&anon_vma->rwsem);
 	atomic_set(&anon_vma->refcount, 0);
 	anon_vma->rb_root = RB_ROOT;
 }
@@ -442,7 +442,7 @@ out:
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
  * reference like with page_get_anon_vma() and then block on the mutex.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
 {
 	struct anon_vma *anon_vma = NULL;
 	struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 	root_anon_vma = ACCESS_ONCE(anon_vma->root);
-	if (mutex_trylock(&root_anon_vma->mutex)) {
+	if (down_read_trylock(&root_anon_vma->rwsem)) {
 		/*
 		 * If the page is still mapped, then this anon_vma is still
 		 * its anon_vma, and holding the mutex ensures that it will
 		 * not go away, see anon_vma_free().
 		 */
 		if (!page_mapped(page)) {
-			mutex_unlock(&root_anon_vma->mutex);
+			up_read(&root_anon_vma->rwsem);
 			anon_vma = NULL;
 		}
 		goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	/* we pinned the anon_vma, its safe to sleep */
 	rcu_read_unlock();
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 
 	if (atomic_dec_and_test(&anon_vma->refcount)) {
 		/*
 		 * Oops, we held the last refcount, release the lock
 		 * and bail -- can't simply use put_anon_vma() because
-		 * we'll deadlock on the anon_vma_lock() recursion.
+		 * we'll deadlock on the anon_vma_lock_write() recursion.
 		 */
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_read(anon_vma);
 		__put_anon_vma(anon_vma);
 		anon_vma = NULL;
 	}
@@ -504,9 +504,9 @@ out:
 	return anon_vma;
 }
 
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 {
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 }
 
 /*
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,
 	struct anon_vma_chain *avc;
 	int referenced = 0;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return referenced;
 
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return referenced;
 }
 
@@ -1315,7 +1315,7 @@ out_mlock:
 	/*
 	 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
 	 * unstable result and race. Plus, We can't wait here because
-	 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+	 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
 	 * if trylock failed, the page remain in evictable lru and later
 	 * vmscan could retry to move the page to unevictable lru if the
 	 * page is actually mlocked.
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return ret;
 
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return ret;
 }
 
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	int ret = SWAP_AGAIN;
 
 	/*
-	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
 	 * because that depends on page_mapped(); but not all its usages
 	 * are holding mmap_sem. Users without mmap_sem are required to
 	 * take a reference count to prevent the anon_vma disappearing
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 	return ret;
 }
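The mm/rmap.c conversion above replaces the anon_vma mutex with a root rw_semaphore so that read-side walkers (page_lock_anon_vma_read(), rmap_walk_anon()) can run concurrently while the mmap/fork paths still take the lock exclusively. The wrappers called throughout these hunks reduce, roughly, to the following paraphrase of the include/linux/rmap.h helpers, assuming struct anon_vma now embeds "struct rw_semaphore rwsem" as the anon_vma_ctor() change (mutex_init -> init_rwsem) indicates:

/* Exclusive lock: used by mmap, fork and unlink paths */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

/* Shared lock: used by rmap walkers that only read the interval tree */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}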
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df14808f0a3..9800306c819 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
 
 	"pgrotated",
 
+#ifdef CONFIG_NUMA_BALANCING
+	"numa_pte_updates",
+	"numa_hint_faults",
+	"numa_hint_faults_local",
+	"numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+	"pgmigrate_success",
+	"pgmigrate_fail",
+#endif
 #ifdef CONFIG_COMPACTION
-	"compact_blocks_moved",
-	"compact_pages_moved",
-	"compact_pagemigrate_failed",
+	"compact_migrate_scanned",
+	"compact_free_scanned",
+	"compact_isolated",
 	"compact_stall",
 	"compact_fail",
 	"compact_success",
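The renamed and newly added counters above are exported through /proc/vmstat. A small stand-alone user-space reader (illustrative, not part of the patch) to watch them; counters the running kernel was not built with simply do not appear:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Counter names taken from the vmstat_text[] additions above */
	static const char * const keys[] = {
		"numa_pte_updates", "numa_hint_faults", "numa_hint_faults_local",
		"numa_pages_migrated", "pgmigrate_success", "pgmigrate_fail",
		"compact_migrate_scanned", "compact_free_scanned", "compact_isolated",
	};
	char name[64];
	unsigned long long val;
	size_t i;
	FILE *fp = fopen("/proc/vmstat", "r");

	if (!fp) {
		perror("/proc/vmstat");
		return 1;
	}
	/* /proc/vmstat is a list of "name value" pairs, one per line */
	while (fscanf(fp, "%63s %llu", name, &val) == 2) {
		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
			if (strcmp(name, keys[i]) == 0)
				printf("%-24s %llu\n", name, val);
		}
	}
	fclose(fp);
	return 0;
}

Compile with "cc -o numastat-dump numastat-dump.c" (file name is arbitrary) and run it before and after a NUMA-heavy workload to see the hinting-fault and migration counters move.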