45 files changed, 1938 insertions, 172 deletions
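Before the patch itself, a rough standalone model of the bit flipping that the new pte_numa()/pte_mknuma()/pte_mknonnuma() helpers in include/asm-generic/pgtable.h (below) perform. This is not kernel code: the flag values are illustrative stand-ins that only mirror the x86 layout described in the patch, where _PAGE_NUMA aliases _PAGE_PROTNONE (bit 8). The point it shows is the encoding the series relies on: a pte counts as a NUMA-hinting pte when the shared bit is set while _PAGE_PRESENT is clear, and clearing the hint re-establishes _PAGE_PRESENT together with _PAGE_ACCESSED.

/* Userspace model of the _PAGE_NUMA encoding used by this series (illustrative only). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_PRESENT	(1u << 0)	/* stand-in for _PAGE_PRESENT */
#define PAGE_ACCESSED	(1u << 5)	/* stand-in for _PAGE_ACCESSED */
#define PAGE_PROTNONE	(1u << 8)	/* stand-in for _PAGE_PROTNONE */
#define PAGE_NUMA	PAGE_PROTNONE	/* same bit, as in pgtable_types.h below */

typedef uint64_t pteval;

/* A NUMA-hinting pte has the shared bit set and _PAGE_PRESENT clear. */
static int pte_numa(pteval pte)
{
	return (pte & (PAGE_NUMA | PAGE_PRESENT)) == PAGE_NUMA;
}

/* Arm the hinting fault: set the shared bit, clear the present bit. */
static pteval pte_mknuma(pteval pte)
{
	return (pte | PAGE_NUMA) & ~(pteval)PAGE_PRESENT;
}

/* Disarm it: restore _PAGE_PRESENT and pre-set _PAGE_ACCESSED. */
static pteval pte_mknonnuma(pteval pte)
{
	return (pte & ~(pteval)PAGE_NUMA) | PAGE_PRESENT | PAGE_ACCESSED;
}

int main(void)
{
	pteval pte = PAGE_PRESENT;	/* ordinary mapped pte */

	pte = pte_mknuma(pte);		/* scanner marks the pte for a hinting fault */
	assert(pte_numa(pte));

	pte = pte_mknonnuma(pte);	/* fault handler clears the hint again */
	assert(!pte_numa(pte));
	assert(pte & PAGE_PRESENT);

	printf("final pte flags: %#llx\n", (unsigned long long)pte);
	return 0;
}

Reusing the PROT_NONE bit, as the pgtable_types.h comment in the patch explains, means no new pte bit is consumed and the existing non-present fault path is what delivers the NUMA hinting fault; the hint is only interpreted as _PAGE_NUMA in places that a real PROT_NONE mapping cannot reach.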
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 20e248cc03a..ea8e5b48557 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.  	nr_uarts=	[SERIAL] maximum number of UARTs to be registered. +	numa_balancing=	[KNL,X86] Enable or disable automatic NUMA balancing. +			Allowed values are enable and disable +  	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.  			one of ['zone', 'node', 'default'] can be specified  			This can be set from sysctl after boot. diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index cb8f9920f4d..0f7c852f355 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -111,6 +111,7 @@ config VSYSCALL  config NUMA  	bool "Non Uniform Memory Access (NUMA) Support"  	depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL +	select ARCH_WANT_NUMA_VARIABLE_LOCALITY  	default n  	help  	  Some SH systems have many various memories scattered around diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 65a872bf72f..97f8c5ad8c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,8 @@ config X86  	def_bool y  	select HAVE_AOUT if X86_32  	select HAVE_UNSTABLE_SCHED_CLOCK +	select ARCH_SUPPORTS_NUMA_BALANCING +	select ARCH_WANTS_PROT_NUMA_PROT_NONE  	select HAVE_IDE  	select HAVE_OPROFILE  	select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f7..5199db2923d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b)  static inline int pte_present(pte_t a)  { -	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | +			       _PAGE_NUMA); +} + +#define pte_accessible pte_accessible +static inline int pte_accessible(pte_t a) +{ +	return pte_flags(a) & _PAGE_PRESENT;  }  static inline int pte_hidden(pte_t pte) @@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd)  	 * the _PAGE_PSE flag will remain set at all times while the  	 * _PAGE_PRESENT bit is clear).  	 */ -	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); +	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | +				 _PAGE_NUMA);  }  static inline int pmd_none(pmd_t pmd) @@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)  static inline int pmd_bad(pmd_t pmd)  { +#ifdef CONFIG_NUMA_BALANCING +	/* pmd_numa check */ +	if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) +		return 0; +#endif  	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;  } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505..3c32db8c539 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -64,6 +64,26 @@  #define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)  #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +/* + * _PAGE_NUMA indicates that this page will trigger a numa hinting + * minor page fault to gather numa placement statistics (see + * pte_numa()). The bit picked (8) is within the range between + * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't + * require changes to the swp entry format because that bit is always + * zero when the pte is not present. 
+ * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + * + * Because we shared the same bit (8) with _PAGE_PROTNONE this can be + * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE + * couldn't reach, like handle_mm_fault() (see access_error in + * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for + * handle_mm_fault() to be invoked). + */ +#define _PAGE_NUMA	_PAGE_PROTNONE +  #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\  			 _PAGE_ACCESSED | _PAGE_DIRTY)  #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 217eb705fac..e27fbf887f3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)  	free_page((unsigned long)pgd);  } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */  int ptep_set_access_flags(struct vm_area_struct *vma,  			  unsigned long address, pte_t *ptep,  			  pte_t entry, int dirty) @@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma,  	if (changed && dirty) {  		*ptep = entry;  		pte_update_defer(vma->vm_mm, address, ptep); -		flush_tlb_page(vma, address);  	}  	return changed; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 284e80831d2..701beab27aa 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)  #define move_pte(pte, prot, old_addr, new_addr)	(pte)  #endif +#ifndef pte_accessible +# define pte_accessible(pte)		((void)(pte),1) +#endif +  #ifndef flush_tlb_fix_spurious_fault  #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)  #endif @@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd)  #endif  } +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the + * same bit too). It's set only when _PAGE_PRESET is not set and it's + * never set if _PAGE_PRESENT is set. + * + * pte/pmd_present() returns true if pte/pmd_numa returns true. Page + * fault triggers on those regions if pte/pmd_numa returns true + * (because _PAGE_PRESENT is not set). + */ +#ifndef pte_numa +static inline int pte_numa(pte_t pte) +{ +	return (pte_flags(pte) & +		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +#ifndef pmd_numa +static inline int pmd_numa(pmd_t pmd) +{ +	return (pmd_flags(pmd) & +		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +/* + * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically + * because they're called by the NUMA hinting minor page fault. If we + * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler + * would be forced to set it later while filling the TLB after we + * return to userland. That would trigger a second write to memory + * that we optimize away by setting _PAGE_ACCESSED here. 
+ */ +#ifndef pte_mknonnuma +static inline pte_t pte_mknonnuma(pte_t pte) +{ +	pte = pte_clear_flags(pte, _PAGE_NUMA); +	return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pmd_mknonnuma +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ +	pmd = pmd_clear_flags(pmd, _PAGE_NUMA); +	return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pte_mknuma +static inline pte_t pte_mknuma(pte_t pte) +{ +	pte = pte_set_flags(pte, _PAGE_NUMA); +	return pte_clear_flags(pte, _PAGE_PRESENT); +} +#endif + +#ifndef pmd_mknuma +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ +	pmd = pmd_set_flags(pmd, _PAGE_NUMA); +	return pmd_clear_flags(pmd, _PAGE_PRESENT); +} +#endif +#else +extern int pte_numa(pte_t pte); +extern int pmd_numa(pmd_t pmd); +extern pte_t pte_mknonnuma(pte_t pte); +extern pmd_t pmd_mknonnuma(pmd_t pmd); +extern pte_t pte_mknuma(pte_t pte); +extern pmd_t pmd_mknuma(pmd_t pmd); +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#else +static inline int pmd_numa(pmd_t pmd) +{ +	return 0; +} + +static inline int pte_numa(pte_t pte) +{ +	return 0; +} + +static inline pte_t pte_mknonnuma(pte_t pte) +{ +	return pte; +} + +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ +	return pmd; +} + +static inline pte_t pte_mknuma(pte_t pte) +{ +	return pte; +} + +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ +	return pmd; +} +#endif /* CONFIG_NUMA_BALANCING */ +  #endif /* CONFIG_MMU */  #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 092dc5305a3..1d76f8ca90f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,  			 unsigned long new_addr, unsigned long old_end,  			 pmd_t *old_pmd, pmd_t *new_pmd);  extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, -			unsigned long addr, pgprot_t newprot); +			unsigned long addr, pgprot_t newprot, +			int prot_numa);  enum transparent_hugepage_flag {  	TRANSPARENT_HUGEPAGE_FLAG, @@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,  #define wait_split_huge_page(__anon_vma, __pmd)				\  	do {								\  		pmd_t *____pmd = (__pmd);				\ -		anon_vma_lock(__anon_vma);				\ +		anon_vma_lock_write(__anon_vma);			\  		anon_vma_unlock(__anon_vma);				\  		BUG_ON(pmd_trans_splitting(*____pmd) ||			\  		       pmd_trans_huge(*____pmd));			\ @@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page)  	}  	return page;  } + +extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +				unsigned long addr, pmd_t pmd, pmd_t *pmdp); +  #else /* CONFIG_TRANSPARENT_HUGEPAGE */  #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })  #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,  {  	return 0;  } + +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +					unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ +	return 0; +} +  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */  #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3e7fa1acf09..0c80d3f57a5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,  				pud_t *pud, int write);  int pmd_huge(pmd_t pmd);  int pud_huge(pud_t pmd); -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct 
vm_area_struct *vma,  		unsigned long address, unsigned long end, pgprot_t newprot);  #else /* !CONFIG_HUGETLB_PAGE */ @@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src)  {  } -#define hugetlb_change_protection(vma, address, end, newprot) +static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +		unsigned long address, unsigned long end, pgprot_t newprot) +{ +	return 0; +}  static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,  			struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index dbd212723b7..9adc270de7e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)  	return 1;  } +extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +  #else  struct mempolicy {}; @@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,  	return 0;  } +static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, +				 unsigned long address) +{ +	return -1; /* no node preference */ +} +  #endif /* CONFIG_NUMA */  #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0b5865c61ef..1e9f627967a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);  #define MIGRATEPAGE_BALLOON_SUCCESS	1 /* special ret code for balloon page  					   * sucessful migration case.  					   */ +enum migrate_reason { +	MR_COMPACTION, +	MR_MEMORY_FAILURE, +	MR_MEMORY_HOTPLUG, +	MR_SYSCALL,		/* also applies to cpusets */ +	MR_MEMPOLICY_MBIND, +	MR_NUMA_MISPLACED, +	MR_CMA +};  #ifdef CONFIG_MIGRATION @@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *,  			struct page *, struct page *, enum migrate_mode);  extern int migrate_pages(struct list_head *l, new_page_t x,  			unsigned long private, bool offlining, -			enum migrate_mode mode); +			enum migrate_mode mode, int reason);  extern int migrate_huge_page(struct page *, new_page_t x,  			unsigned long private, bool offlining,  			enum migrate_mode mode); @@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {}  static inline void putback_movable_pages(struct list_head *l) {}  static inline int migrate_pages(struct list_head *l, new_page_t x,  		unsigned long private, bool offlining, -		enum migrate_mode mode) { return -ENOSYS; } +		enum migrate_mode mode, int reason) { return -ENOSYS; }  static inline int migrate_huge_page(struct page *page, new_page_t x,  		unsigned long private, bool offlining,  		enum migrate_mode mode) { return -ENOSYS; } @@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,  #define fail_migrate_page NULL  #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_NUMA_BALANCING +extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, int node); +extern bool migrate_ratelimited(int node); +#else +static inline int migrate_misplaced_page(struct page *page, int node) +{ +	return -EAGAIN; /* can't migrate now */ +} +static inline bool migrate_ratelimited(int node) +{ +	return false; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, +			struct 
vm_area_struct *vma, +			pmd_t *pmd, pmd_t entry, +			unsigned long address, +			struct page *page, int node); +#else +static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, +			struct vm_area_struct *vma, +			pmd_t *pmd, pmd_t entry, +			unsigned long address, +			struct page *page, int node) +{ +	return -EAGAIN; +} +#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ +  #endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 4af4f0b1be4..7f4f906190b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page)  }  #endif +#ifdef CONFIG_NUMA_BALANCING +static inline int page_xchg_last_nid(struct page *page, int nid) +{ +	return xchg(&page->_last_nid, nid); +} + +static inline int page_last_nid(struct page *page) +{ +	return page->_last_nid; +} +static inline void reset_page_last_nid(struct page *page) +{ +	page->_last_nid = -1; +} +#else +static inline int page_xchg_last_nid(struct page *page, int nid) +{ +	return page_to_nid(page); +} + +static inline int page_last_nid(struct page *page) +{ +	return page_to_nid(page); +} + +static inline void reset_page_last_nid(struct page *page) +{ +} +#endif +  static inline struct zone *page_zone(const struct page *page)  {  	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; @@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,  extern unsigned long do_mremap(unsigned long addr,  			       unsigned long old_len, unsigned long new_len,  			       unsigned long flags, unsigned long new_addr); +extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +			      unsigned long end, pgprot_t newprot, +			      int dirty_accountable, int prot_numa);  extern int mprotect_fixup(struct vm_area_struct *vma,  			  struct vm_area_struct **pprev, unsigned long start,  			  unsigned long end, unsigned long newflags); @@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)  }  #endif +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long start, unsigned long end); +#endif +  struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);  int remap_pfn_range(struct vm_area_struct *, unsigned long addr,  			unsigned long pfn, unsigned long size, pgprot_t); @@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,  #define FOLL_MLOCK	0x40	/* mark page as mlocked */  #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */  #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */ +#define FOLL_NUMA	0x200	/* force NUMA hinting page fault */  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,  			void *data); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7ade2731b5d..7d9ebb7cc98 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,6 +175,10 @@ struct page {  	 */  	void *shadow;  #endif + +#ifdef CONFIG_NUMA_BALANCING +	int _last_nid; +#endif  }  /*   * The struct page can be forced to be double word aligned so that atomic ops @@ -411,9 +415,36 @@ struct mm_struct {  #ifdef CONFIG_CPUMASK_OFFSTACK  	struct cpumask cpumask_allocation;  #endif +#ifdef CONFIG_NUMA_BALANCING +	/* +	 * numa_next_scan is the next time when the PTEs will me marked +	 * pte_numa to gather statistics and migrate pages to new nodes +	 * if necessary +	 
*/ +	unsigned long numa_next_scan; + +	/* numa_next_reset is when the PTE scanner period will be reset */ +	unsigned long numa_next_reset; + +	/* Restart point for scanning and setting pte_numa */ +	unsigned long numa_scan_offset; + +	/* numa_scan_seq prevents two threads setting pte_numa */ +	int numa_scan_seq; + +	/* +	 * The first node a task was scheduled on. If a task runs on +	 * a different node than Make PTE Scan Go Now. +	 */ +	int first_nid; +#endif  	struct uprobes_state uprobes_state;  }; +/* first nid will either be a valid NID or one of these values */ +#define NUMA_PTE_SCAN_INIT	-1 +#define NUMA_PTE_SCAN_ACTIVE	-2 +  static inline void mm_init_cpumask(struct mm_struct *mm)  {  #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd55dad56aa..4bec5be82ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -735,6 +735,19 @@ typedef struct pglist_data {  	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */  	int kswapd_max_order;  	enum zone_type classzone_idx; +#ifdef CONFIG_NUMA_BALANCING +	/* +	 * Lock serializing the per destination node AutoNUMA memory +	 * migration rate limiting data. +	 */ +	spinlock_t numabalancing_migrate_lock; + +	/* Rate limiting time interval */ +	unsigned long numabalancing_migrate_next_window; + +	/* Number of pages migrated during the rate limiting time interval */ +	unsigned long numabalancing_migrate_nr_pages; +#endif  } pg_data_t;  #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bfe1f478064..c20635c527a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -7,7 +7,7 @@  #include <linux/list.h>  #include <linux/slab.h>  #include <linux/mm.h> -#include <linux/mutex.h> +#include <linux/rwsem.h>  #include <linux/memcontrol.h>  /* @@ -25,8 +25,8 @@   * pointing to this anon_vma once its vma list is empty.   
*/  struct anon_vma { -	struct anon_vma *root;	/* Root of this anon_vma tree */ -	struct mutex mutex;	/* Serialize access to vma list */ +	struct anon_vma *root;		/* Root of this anon_vma tree */ +	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */  	/*  	 * The refcount is taken on an anon_vma when there is no  	 * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain {  	struct vm_area_struct *vma;  	struct anon_vma *anon_vma;  	struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */ -	struct rb_node rb;			/* locked by anon_vma->mutex */ +	struct rb_node rb;			/* locked by anon_vma->rwsem */  	unsigned long rb_subtree_last;  #ifdef CONFIG_DEBUG_VM_RB  	unsigned long cached_vma_start, cached_vma_last; @@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)  {  	struct anon_vma *anon_vma = vma->anon_vma;  	if (anon_vma) -		mutex_lock(&anon_vma->root->mutex); +		down_write(&anon_vma->root->rwsem);  }  static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)  {  	struct anon_vma *anon_vma = vma->anon_vma;  	if (anon_vma) -		mutex_unlock(&anon_vma->root->mutex); +		up_write(&anon_vma->root->rwsem);  } -static inline void anon_vma_lock(struct anon_vma *anon_vma) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma)  { -	mutex_lock(&anon_vma->root->mutex); +	down_write(&anon_vma->root->rwsem);  }  static inline void anon_vma_unlock(struct anon_vma *anon_vma)  { -	mutex_unlock(&anon_vma->root->mutex); +	up_write(&anon_vma->root->rwsem);  } +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ +	down_read(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ +	up_read(&anon_vma->root->rwsem); +} + +  /*   * anon_vma helper functions.   */ @@ -220,8 +231,8 @@ int try_to_munlock(struct page *);  /*   * Called by memory-failure.c to kill processes.   */ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma_read(struct page *page); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma);  int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);  /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c2f3072bee..b089c92c609 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,14 @@ struct task_struct {  	short il_next;  	short pref_node_fork;  #endif +#ifdef CONFIG_NUMA_BALANCING +	int numa_scan_seq; +	int numa_migrate_seq; +	unsigned int numa_scan_period; +	u64 node_stamp;			/* migration stamp  */ +	struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ +  	struct rcu_head rcu;  	/* @@ -1601,6 +1609,18 @@ struct task_struct {  /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); +#else +static inline void task_numa_fault(int node, int pages, bool migrated) +{ +} +static inline void set_numabalancing_state(bool enabled) +{ +} +#endif +  /*   * Priority of a process goes from 0..MAX_PRIO-1, valid RT   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -2030,6 +2050,13 @@ enum sched_tunable_scaling {  };  extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_settle_count; +  #ifdef CONFIG_SCHED_DEBUG  extern unsigned int sysctl_sched_migration_cost;  extern unsigned int sysctl_sched_nr_migrate; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fe786f07d2b..fce0a2799d4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,  		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,  		KSWAPD_SKIP_CONGESTION_WAIT,  		PAGEOUTRUN, ALLOCSTALL, PGROTATED, +#ifdef CONFIG_NUMA_BALANCING +		NUMA_PTE_UPDATES, +		NUMA_HINT_FAULTS, +		NUMA_HINT_FAULTS_LOCAL, +		NUMA_PAGE_MIGRATE, +#endif +#ifdef CONFIG_MIGRATION +		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, +#endif  #ifdef CONFIG_COMPACTION -		COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, +		COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, +		COMPACTISOLATED,  		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,  #endif  #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 92a86b2cce3..a13291f7da8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu)  #endif /* CONFIG_VM_EVENT_COUNTERS */ +#ifdef CONFIG_NUMA_BALANCING +#define count_vm_numa_event(x)     count_vm_event(x) +#define count_vm_numa_events(x, y) count_vm_events(x, y) +#else +#define count_vm_numa_event(x) do {} while (0) +#define count_vm_numa_events(x, y) do {} while (0) +#endif /* CONFIG_NUMA_BALANCING */ +  #define __count_zone_vm_events(item, zone, delta) \  		__count_vm_events(item##_NORMAL - ZONE_NORMAL + \  		zone_idx(zone), delta) diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h new file mode 100644 index 00000000000..ec2a6ccfd7e --- /dev/null +++ b/include/trace/events/migrate.h @@ -0,0 +1,51 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM migrate + +#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MIGRATE_H + +#define MIGRATE_MODE						\ +	{MIGRATE_ASYNC,		"MIGRATE_ASYNC"},		\ +	{MIGRATE_SYNC_LIGHT,	"MIGRATE_SYNC_LIGHT"},		\ +	{MIGRATE_SYNC,		"MIGRATE_SYNC"}		 + +#define MIGRATE_REASON						\ +	{MR_COMPACTION,		"compaction"},			\ +	{MR_MEMORY_FAILURE,	"memory_failure"},		\ +	{MR_MEMORY_HOTPLUG,	"memory_hotplug"},		\ +	{MR_SYSCALL,		"syscall_or_cpuset"},		\ +	{MR_MEMPOLICY_MBIND,	"mempolicy_mbind"},		\ +	{MR_CMA,		"cma"} + +TRACE_EVENT(mm_migrate_pages, + +	TP_PROTO(unsigned long succeeded, unsigned long failed, +		 enum migrate_mode mode, int reason), + +	TP_ARGS(succeeded, failed, mode, reason), + +	TP_STRUCT__entry( +	
	__field(	unsigned long,		succeeded) +		__field(	unsigned long,		failed) +		__field(	enum migrate_mode,	mode) +		__field(	int,			reason) +	), + +	TP_fast_assign( +		__entry->succeeded	= succeeded; +		__entry->failed		= failed; +		__entry->mode		= mode; +		__entry->reason		= reason; +	), + +	TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", +		__entry->succeeded, +		__entry->failed, +		__print_symbolic(__entry->mode, MIGRATE_MODE), +		__print_symbolic(__entry->reason, MIGRATE_REASON)) +); + +#endif /* _TRACE_MIGRATE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 23e62e0537e..0d11c3dcd3a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -20,6 +20,7 @@ enum {  	MPOL_PREFERRED,  	MPOL_BIND,  	MPOL_INTERLEAVE, +	MPOL_LOCAL,  	MPOL_MAX,	/* always last member of enum */  }; @@ -47,9 +48,15 @@ enum mpol_rebind_step {  /* Flags for mbind */  #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */ -#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */ +#define MPOL_MF_MOVE	 (1<<1)	/* Move pages owned by this process to conform +				   to policy */ +#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to policy */ +#define MPOL_MF_LAZY	 (1<<3)	/* Modifies '_MOVE:  lazy migrate on fault */ +#define MPOL_MF_INTERNAL (1<<4)	/* Internal flags start here */ + +#define MPOL_MF_VALID	(MPOL_MF_STRICT   | 	\ +			 MPOL_MF_MOVE     | 	\ +			 MPOL_MF_MOVE_ALL)  /*   * Internal flags that share the struct mempolicy flags word with @@ -59,6 +66,8 @@ enum mpol_rebind_step {  #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */  #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */  #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */ +#define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */ +#define MPOL_F_MORON	(1 << 4) /* Migrate On pte_numa Reference On Node */  #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/init/Kconfig b/init/Kconfig index 2054e048bb9..1a207efca59 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -717,6 +717,50 @@ config LOG_BUF_SHIFT  config HAVE_UNSTABLE_SCHED_CLOCK  	bool +# +# For architectures that want to enable the support for NUMA-affine scheduler +# balancing logic: +# +config ARCH_SUPPORTS_NUMA_BALANCING +	bool + +# For architectures that (ab)use NUMA to represent different memory regions +# all cpu-local but of different latencies, such as SuperH. +# +config ARCH_WANT_NUMA_VARIABLE_LOCALITY +	bool + +# +# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE +config ARCH_WANTS_PROT_NUMA_PROT_NONE +	bool + +config ARCH_USES_NUMA_PROT_NONE +	bool +	default y +	depends on ARCH_WANTS_PROT_NUMA_PROT_NONE +	depends on NUMA_BALANCING + +config NUMA_BALANCING_DEFAULT_ENABLED +	bool "Automatically enable NUMA aware memory/task placement" +	default y +	depends on NUMA_BALANCING +	help +	  If set, autonumic NUMA balancing will be enabled if running on a NUMA +	  machine. + +config NUMA_BALANCING +	bool "Memory placement aware NUMA scheduler" +	depends on ARCH_SUPPORTS_NUMA_BALANCING +	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY +	depends on SMP && NUMA && MIGRATION +	help +	  This option adds support for automatic NUMA aware memory/task placement. 
+	  The mechanism is quite primitive and is based on migrating memory when +	  it is references to the node the task is running on. + +	  This system will be inactive on UMA systems. +  menuconfig CGROUPS  	boolean "Control Group support"  	depends on EVENTFD diff --git a/kernel/fork.c b/kernel/fork.c index 3c31e874afa..115d6c2e4cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)  #ifdef CONFIG_TRANSPARENT_HUGEPAGE  	mm->pmd_huge_pte = NULL;  #endif +#ifdef CONFIG_NUMA_BALANCING +	mm->first_nid = NUMA_PTE_SCAN_INIT; +#endif  	if (!mm_init(mm, tsk))  		goto fail_nomem; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533496b622..c1fb82104bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { };  static void sched_feat_enable(int i) { };  #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, -		size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp)  { -	char buf[64]; -	char *cmp; -	int neg = 0;  	int i; - -	if (cnt > 63) -		cnt = 63; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; -	cmp = strstrip(buf); +	int neg = 0;  	if (strncmp(cmp, "NO_", 3) == 0) {  		neg = 1; @@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  		}  	} +	return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, +		size_t cnt, loff_t *ppos) +{ +	char buf[64]; +	char *cmp; +	int i; + +	if (cnt > 63) +		cnt = 63; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; +	cmp = strstrip(buf); + +	i = sched_feat_set(cmp);  	if (i == __SCHED_FEAT_NR)  		return -EINVAL; @@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p)  #ifdef CONFIG_PREEMPT_NOTIFIERS  	INIT_HLIST_HEAD(&p->preempt_notifiers);  #endif + +#ifdef CONFIG_NUMA_BALANCING +	if (p->mm && atomic_read(&p->mm->mm_users) == 1) { +		p->mm->numa_next_scan = jiffies; +		p->mm->numa_next_reset = jiffies; +		p->mm->numa_scan_seq = 0; +	} + +	p->node_stamp = 0ULL; +	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; +	p->numa_migrate_seq = p->mm ? 
p->mm->numa_scan_seq - 1 : 0; +	p->numa_scan_period = sysctl_numa_balancing_scan_delay; +	p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ +} + +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ +	if (enabled) +		sched_feat_set("NUMA"); +	else +		sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ +	numabalancing_enabled = enabled;  } +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */  /*   * fork()/clone()-time setup: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 756f9f9e854..9af5af979a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,9 @@  #include <linux/slab.h>  #include <linux/profile.h>  #include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h>  #include <trace/events/sched.h> @@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)   * Scheduling class queueing methods:   */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms + */ +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static void task_numa_placement(struct task_struct *p) +{ +	int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + +	if (p->numa_scan_seq == seq) +		return; +	p->numa_scan_seq = seq; + +	/* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages, bool migrated) +{ +	struct task_struct *p = current; + +	if (!sched_feat_numa(NUMA)) +		return; + +	/* FIXME: Allocate task-specific structure for placement policy here */ + +	/* +	 * If pages are properly placed (did not migrate) then scan slower. +	 * This is reset periodically in case of phase changes +	 */ +        if (!migrated) +		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, +			p->numa_scan_period + jiffies_to_msecs(10)); + +	task_numa_placement(p); +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ +	ACCESS_ONCE(p->mm->numa_scan_seq)++; +	p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ +	unsigned long migrate, next_scan, now = jiffies; +	struct task_struct *p = current; +	struct mm_struct *mm = p->mm; +	struct vm_area_struct *vma; +	unsigned long start, end; +	long pages; + +	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + +	work->next = work; /* protect against double add */ +	/* +	 * Who cares about NUMA placement when they're dying. +	 * +	 * NOTE: make sure not to dereference p->mm before this check, +	 * exit_task_work() happens _after_ exit_mm() so we could be called +	 * without p->mm even though we still had it when we enqueued this +	 * work. +	 */ +	if (p->flags & PF_EXITING) +		return; + +	/* +	 * We do not care about task placement until a task runs on a node +	 * other than the first one used by the address space. 
This is +	 * largely because migrations are driven by what CPU the task +	 * is running on. If it's never scheduled on another node, it'll +	 * not migrate so why bother trapping the fault. +	 */ +	if (mm->first_nid == NUMA_PTE_SCAN_INIT) +		mm->first_nid = numa_node_id(); +	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { +		/* Are we running on a new node yet? */ +		if (numa_node_id() == mm->first_nid && +		    !sched_feat_numa(NUMA_FORCE)) +			return; + +		mm->first_nid = NUMA_PTE_SCAN_ACTIVE; +	} + +	/* +	 * Reset the scan period if enough time has gone by. Objective is that +	 * scanning will be reduced if pages are properly placed. As tasks +	 * can enter different phases this needs to be re-examined. Lacking +	 * proper tracking of reference behaviour, this blunt hammer is used. +	 */ +	migrate = mm->numa_next_reset; +	if (time_after(now, migrate)) { +		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; +		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); +		xchg(&mm->numa_next_reset, next_scan); +	} + +	/* +	 * Enforce maximal scan/migration frequency.. +	 */ +	migrate = mm->numa_next_scan; +	if (time_before(now, migrate)) +		return; + +	if (p->numa_scan_period == 0) +		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + +	next_scan = now + msecs_to_jiffies(p->numa_scan_period); +	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) +		return; + +	/* +	 * Do not set pte_numa if the current running node is rate-limited. +	 * This loses statistics on the fault but if we are unwilling to +	 * migrate to this node, it is less likely we can do useful work +	 */ +	if (migrate_ratelimited(numa_node_id())) +		return; + +	start = mm->numa_scan_offset; +	pages = sysctl_numa_balancing_scan_size; +	pages <<= 20 - PAGE_SHIFT; /* MB in pages */ +	if (!pages) +		return; + +	down_read(&mm->mmap_sem); +	vma = find_vma(mm, start); +	if (!vma) { +		reset_ptenuma_scan(p); +		start = 0; +		vma = mm->mmap; +	} +	for (; vma; vma = vma->vm_next) { +		if (!vma_migratable(vma)) +			continue; + +		/* Skip small VMAs. They are not likely to be of relevance */ +		if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) +			continue; + +		do { +			start = max(start, vma->vm_start); +			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); +			end = min(end, vma->vm_end); +			pages -= change_prot_numa(vma, start, end); + +			start = end; +			if (pages <= 0) +				goto out; +		} while (end != vma->vm_end); +	} + +out: +	/* +	 * It is possible to reach the end of the VMA list but the last few VMAs are +	 * not guaranteed to the vma_migratable. If they are not, we would find the +	 * !migratable VMA on the next scan but not reset the scanner to the start +	 * so check it now. +	 */ +	if (vma) +		mm->numa_scan_offset = start; +	else +		reset_ptenuma_scan(p); +	up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +	struct callback_head *work = &curr->numa_work; +	u64 period, now; + +	/* +	 * We don't care about NUMA placement if we don't have memory. +	 */ +	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) +		return; + +	/* +	 * Using runtime rather than walltime has the dual advantage that +	 * we (mostly) drive the selection from busy threads and that the +	 * task needs to have done some actual work before we bother with +	 * NUMA placement. 
+	 */ +	now = curr->se.sum_exec_runtime; +	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + +	if (now - curr->node_stamp > period) { +		if (!curr->node_stamp) +			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; +		curr->node_stamp = now; + +		if (!time_before(jiffies, curr->mm->numa_next_scan)) { +			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ +			task_work_add(curr, work, true); +		} +	} +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ +  static void  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  { @@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  		entity_tick(cfs_rq, se, queued);  	} +	if (sched_feat_numa(NUMA)) +		task_tick_numa(rq, curr); +  	update_rq_runnable_avg(rq, 1);  } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e68e69ab917..1ad1d2b5395 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)  SCHED_FEAT(FORCE_SD_OVERLAP, false)  SCHED_FEAT(RT_RUNTIME_SHARE, true)  SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA,	false) +SCHED_FEAT(NUMA_FORCE,	false) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5eca173b563..fc886441436 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];  #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))  #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ +  static inline u64 global_rt_period(void)  {  	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33f71f37267..c88878db491 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */  static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */  static int min_wakeup_granularity_ns;			/* 0 usecs */  static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */ +#ifdef CONFIG_SMP  static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;  static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */  #ifdef CONFIG_COMPACTION  static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {  		.extra1		= &min_wakeup_granularity_ns,  		.extra2		= &max_wakeup_granularity_ns,  	}, +#ifdef CONFIG_SMP  	{  		.procname	= "sched_tunable_scaling",  		.data		= &sysctl_sched_tunable_scaling, @@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = {  		.extra1		= &zero,  		.extra2		= &one,  	}, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING +	{ +		.procname	= "numa_balancing_scan_delay_ms", +		.data		= &sysctl_numa_balancing_scan_delay, 
+		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "numa_balancing_scan_period_min_ms", +		.data		= &sysctl_numa_balancing_scan_period_min, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "numa_balancing_scan_period_reset", +		.data		= &sysctl_numa_balancing_scan_period_reset, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "numa_balancing_scan_period_max_ms", +		.data		= &sysctl_numa_balancing_scan_period_max, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "numa_balancing_scan_size_mb", +		.data		= &sysctl_numa_balancing_scan_size, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */  	{  		.procname	= "sched_rt_period_us",  		.data		= &sysctl_sched_rt_period, diff --git a/mm/compaction.c b/mm/compaction.c index 12979121822..5ad7f4f4d6f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,  	if (blockpfn == end_pfn)  		update_pageblock_skip(cc, valid_page, total_isolated, false); +	count_vm_events(COMPACTFREE_SCANNED, nr_scanned); +	if (total_isolated) +		count_vm_events(COMPACTISOLATED, total_isolated); +  	return total_isolated;  } @@ -609,6 +613,10 @@ next_pageblock:  	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); +	count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); +	if (nr_isolated) +		count_vm_events(COMPACTISOLATED, nr_isolated); +  	return low_pfn;  } @@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)  		nr_migrate = cc->nr_migratepages;  		err = migrate_pages(&cc->migratepages, compaction_alloc,  				(unsigned long)cc, false, -				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); +				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, +				MR_COMPACTION);  		update_nr_listpages(cc);  		nr_remaining = cc->nr_migratepages; -		count_vm_event(COMPACTBLOCKS); -		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); -		if (nr_remaining) -			count_vm_events(COMPACTPAGEFAILED, nr_remaining);  		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,  						nr_remaining); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c81305..d7ee1691fd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -19,6 +19,7 @@  #include <linux/freezer.h>  #include <linux/mman.h>  #include <linux/pagemap.h> +#include <linux/migrate.h>  #include <asm/tlb.h>  #include <asm/pgalloc.h> @@ -690,7 +691,7 @@ out:  }  __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)  {  	if (likely(vma->vm_flags & VM_WRITE))  		pmd = pmd_mkwrite(pmd); @@ -848,7 +849,8 @@ out:  	 * run pte_offset_map on the pmd, if an huge pmd could  	 * materialize from under us from a different thread.  	 
*/ -	if (unlikely(__pte_alloc(mm, vma, pmd, address))) +	if (unlikely(pmd_none(*pmd)) && +	    unlikely(__pte_alloc(mm, vma, pmd, address)))  		return VM_FAULT_OOM;  	/* if an huge pmd materialized from under us just retry later */  	if (unlikely(pmd_trans_huge(*pmd))) @@ -1287,6 +1289,81 @@ out:  	return page;  } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +				unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ +	struct page *page; +	unsigned long haddr = addr & HPAGE_PMD_MASK; +	int target_nid; +	int current_nid = -1; +	bool migrated; +	bool page_locked = false; + +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(pmd, *pmdp))) +		goto out_unlock; + +	page = pmd_page(pmd); +	get_page(page); +	current_nid = page_to_nid(page); +	count_vm_numa_event(NUMA_HINT_FAULTS); +	if (current_nid == numa_node_id()) +		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + +	target_nid = mpol_misplaced(page, vma, haddr); +	if (target_nid == -1) { +		put_page(page); +		goto clear_pmdnuma; +	} + +	/* Acquire the page lock to serialise THP migrations */ +	spin_unlock(&mm->page_table_lock); +	lock_page(page); +	page_locked = true; + +	/* Confirm the PTE did not while locked */ +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(pmd, *pmdp))) { +		unlock_page(page); +		put_page(page); +		goto out_unlock; +	} +	spin_unlock(&mm->page_table_lock); + +	/* Migrate the THP to the requested node */ +	migrated = migrate_misplaced_transhuge_page(mm, vma, +				pmdp, pmd, addr, +				page, target_nid); +	if (migrated) +		current_nid = target_nid; +	else { +		spin_lock(&mm->page_table_lock); +		if (unlikely(!pmd_same(pmd, *pmdp))) { +			unlock_page(page); +			goto out_unlock; +		} +		goto clear_pmdnuma; +	} + +	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); +	return 0; + +clear_pmdnuma: +	pmd = pmd_mknonnuma(pmd); +	set_pmd_at(mm, haddr, pmdp, pmd); +	VM_BUG_ON(pmd_numa(*pmdp)); +	update_mmu_cache_pmd(vma, addr, pmdp); +	if (page_locked) +		unlock_page(page); + +out_unlock: +	spin_unlock(&mm->page_table_lock); +	if (current_nid != -1) +		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); +	return 0; +} +  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,  		 pmd_t *pmd, unsigned long addr)  { @@ -1375,7 +1452,7 @@ out:  }  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, -		unsigned long addr, pgprot_t newprot) +		unsigned long addr, pgprot_t newprot, int prot_numa)  {  	struct mm_struct *mm = vma->vm_mm;  	int ret = 0; @@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,  	if (__pmd_trans_huge_lock(pmd, vma) == 1) {  		pmd_t entry;  		entry = pmdp_get_and_clear(mm, addr, pmd); -		entry = pmd_modify(entry, newprot); +		if (!prot_numa) +			entry = pmd_modify(entry, newprot); +		else { +			struct page *page = pmd_page(*pmd); + +			/* only check non-shared pages */ +			if (page_mapcount(page) == 1 && +			    !pmd_numa(*pmd)) { +				entry = pmd_mknuma(entry); +			} +		}  		BUG_ON(pmd_write(entry));  		set_pmd_at(mm, addr, pmd, entry);  		spin_unlock(&vma->vm_mm->page_table_lock); @@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,  		 * We can't temporarily set the pmd to null in order  		 * to split it, the pmd must remain marked huge at all  		 * times or the VM won't take the pmd_trans_huge paths -		 * and it won't wait on the anon_vma->root->mutex to +		 * and it won't wait on the anon_vma->root->rwsem to  		 * serialize 
against split_huge_page*.  		 */  		pmdp_splitting_flush(vma, address, pmd); @@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)  		page_tail->mapping = page->mapping;  		page_tail->index = page->index + i; +		page_xchg_last_nid(page_tail, page_last_nid(page));  		BUG_ON(!PageAnon(page_tail));  		BUG_ON(!PageUptodate(page_tail)); @@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,  				BUG_ON(page_mapcount(page) != 1);  			if (!pmd_young(*pmd))  				entry = pte_mkold(entry); +			if (pmd_numa(*pmd)) +				entry = pte_mknuma(entry);  			pte = pte_offset_map(&_pmd, haddr);  			BUG_ON(!pte_none(*pte));  			set_pte_at(mm, haddr, pte, entry); @@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,  	return ret;  } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */  static void __split_huge_page(struct page *page,  			      struct anon_vma *anon_vma)  { @@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)  	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));  	BUG_ON(!PageAnon(page)); -	anon_vma = page_lock_anon_vma(page); +	anon_vma = page_lock_anon_vma_read(page);  	if (!anon_vma)  		goto out;  	ret = 0; @@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)  	BUG_ON(PageCompound(page));  out_unlock: -	page_unlock_anon_vma(anon_vma); +	page_unlock_anon_vma_read(anon_vma);  out:  	return ret;  } @@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,  	if (pmd_trans_huge(*pmd))  		goto out; -	anon_vma_lock(vma->anon_vma); +	anon_vma_lock_write(vma->anon_vma);  	pte = pte_offset_map(pmd, address);  	ptl = pte_lockptr(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96b..e5318c7793a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3016,7 +3016,7 @@ same_page:  	return i ? 
i : -EFAULT;  } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma,  		unsigned long address, unsigned long end, pgprot_t newprot)  {  	struct mm_struct *mm = vma->vm_mm; @@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  	pte_t *ptep;  	pte_t pte;  	struct hstate *h = hstate_vma(vma); +	unsigned long pages = 0;  	BUG_ON(address >= end);  	flush_cache_range(vma, address, end); @@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  		ptep = huge_pte_offset(mm, address);  		if (!ptep)  			continue; -		if (huge_pmd_unshare(mm, &address, ptep)) +		if (huge_pmd_unshare(mm, &address, ptep)) { +			pages++;  			continue; +		}  		if (!huge_pte_none(huge_ptep_get(ptep))) {  			pte = huge_ptep_get_and_clear(mm, address, ptep);  			pte = pte_mkhuge(pte_modify(pte, newprot));  			set_huge_pte_at(mm, address, ptep, pte); +			pages++;  		}  	}  	spin_unlock(&mm->page_table_lock); @@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,  	 */  	flush_tlb_range(vma, start, end);  	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + +	return pages << h->order;  }  int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/internal.h b/mm/internal.h index 52d1fa95719..d597f94cc20 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)  {  	if (TestClearPageMlocked(page)) {  		unsigned long flags; +		int nr_pages = hpage_nr_pages(page);  		local_irq_save(flags); -		__dec_zone_page_state(page, NR_MLOCK); +		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);  		SetPageMlocked(newpage); -		__inc_zone_page_state(newpage, NR_MLOCK); +		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);  		local_irq_restore(flags);  	}  } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +  #ifdef CONFIG_TRANSPARENT_HUGEPAGE  extern unsigned long vma_address(struct page *page,  				 struct vm_area_struct *vma); @@ -1624,7 +1624,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; @@ -1678,7 +1678,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; @@ -1731,7 +1731,7 @@ again:  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,  					       0, ULONG_MAX) {  			vma = vmac->vma; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8c..bbfac5063ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,  				  struct mem_cgroup **memcgp)  {  	struct mem_cgroup *memcg = NULL; +	unsigned int nr_pages = 1;  	struct page_cgroup *pc;  	enum charge_type ctype;  	*memcgp = NULL; -	VM_BUG_ON(PageTransHuge(page));  	if (mem_cgroup_disabled())  		return; +	if (PageTransHuge(page)) +		nr_pages <<= compound_order(page); +  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) { @@ -3359,7 +3362,7 @@ void 
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,  	 * charged to the res_counter since we plan on replacing the  	 * old one and only one page is going to be left afterwards.  	 */ -	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); +	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);  }  /* remove redundant charge if migration failed*/ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f..c6e4dd3e1c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,  	struct anon_vma *av;  	pgoff_t pgoff; -	av = page_lock_anon_vma(page); +	av = page_lock_anon_vma_read(page);  	if (av == NULL)	/* Not actually mapped anymore */  		return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,  		}  	}  	read_unlock(&tasklist_lock); -	page_unlock_anon_vma(av); +	page_unlock_anon_vma_read(av);  }  /* @@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)  					    page_is_file_cache(page));  		list_add(&page->lru, &pagelist);  		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, -							false, MIGRATE_SYNC); +							false, MIGRATE_SYNC, +							MR_MEMORY_FAILURE);  		if (ret) {  			putback_lru_pages(&pagelist);  			pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a0..e6a3b933517 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@  #include <linux/swapops.h>  #include <linux/elf.h>  #include <linux/gfp.h> +#include <linux/migrate.h>  #include <asm/io.h>  #include <asm/pgalloc.h> @@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,  		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);  		goto out;  	} +	if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) +		goto no_page_table;  	if (pmd_trans_huge(*pmd)) {  		if (flags & FOLL_SPLIT) {  			split_huge_page_pmd(vma, address, pmd); @@ -1532,6 +1535,8 @@ split_fallthrough:  	pte = *ptep;  	if (!pte_present(pte))  		goto no_page; +	if ((flags & FOLL_NUMA) && pte_numa(pte)) +		goto no_page;  	if ((flags & FOLL_WRITE) && !pte_write(pte))  		goto unlock; @@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,  			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);  	vm_flags &= (gup_flags & FOLL_FORCE) ?  			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + +	/* +	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault +	 * would be called on PROT_NONE ranges. We must never invoke +	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting +	 * page faults would unprotect the PROT_NONE ranges if +	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd +	 * bitflag. So to avoid that, don't set FOLL_NUMA if +	 * FOLL_FORCE is set. 
+	 */ +	if (!(gup_flags & FOLL_FORCE)) +		gup_flags |= FOLL_NUMA; +  	i = 0;  	do { @@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);  } +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +				unsigned long addr, int current_nid) +{ +	get_page(page); + +	count_vm_numa_event(NUMA_HINT_FAULTS); +	if (current_nid == numa_node_id()) +		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + +	return mpol_misplaced(page, vma, addr); +} + +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ +	struct page *page = NULL; +	spinlock_t *ptl; +	int current_nid = -1; +	int target_nid; +	bool migrated = false; + +	/* +	* The "pte" at this point cannot be used safely without +	* validation through pte_unmap_same(). It's of NUMA type but +	* the pfn may be screwed if the read is non atomic. +	* +	* ptep_modify_prot_start is not called as this is clearing +	* the _PAGE_NUMA bit and it is not really expected that there +	* would be concurrent hardware modifications to the PTE. +	*/ +	ptl = pte_lockptr(mm, pmd); +	spin_lock(ptl); +	if (unlikely(!pte_same(*ptep, pte))) { +		pte_unmap_unlock(ptep, ptl); +		goto out; +	} + +	pte = pte_mknonnuma(pte); +	set_pte_at(mm, addr, ptep, pte); +	update_mmu_cache(vma, addr, ptep); + +	page = vm_normal_page(vma, addr, pte); +	if (!page) { +		pte_unmap_unlock(ptep, ptl); +		return 0; +	} + +	current_nid = page_to_nid(page); +	target_nid = numa_migrate_prep(page, vma, addr, current_nid); +	pte_unmap_unlock(ptep, ptl); +	if (target_nid == -1) { +		/* +		 * Account for the fault against the current node if it not +		 * being replaced regardless of where the page is located. 
+		 */ +		current_nid = numa_node_id(); +		put_page(page); +		goto out; +	} + +	/* Migrate to the requested node */ +	migrated = migrate_misplaced_page(page, target_nid); +	if (migrated) +		current_nid = target_nid; + +out: +	if (current_nid != -1) +		task_numa_fault(current_nid, 1, migrated); +	return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		     unsigned long addr, pmd_t *pmdp) +{ +	pmd_t pmd; +	pte_t *pte, *orig_pte; +	unsigned long _addr = addr & PMD_MASK; +	unsigned long offset; +	spinlock_t *ptl; +	bool numa = false; +	int local_nid = numa_node_id(); + +	spin_lock(&mm->page_table_lock); +	pmd = *pmdp; +	if (pmd_numa(pmd)) { +		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); +		numa = true; +	} +	spin_unlock(&mm->page_table_lock); + +	if (!numa) +		return 0; + +	/* we're in a page fault so some vma must be in the range */ +	BUG_ON(!vma); +	BUG_ON(vma->vm_start >= _addr + PMD_SIZE); +	offset = max(_addr, vma->vm_start) & ~PMD_MASK; +	VM_BUG_ON(offset >= PMD_SIZE); +	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); +	pte += offset >> PAGE_SHIFT; +	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { +		pte_t pteval = *pte; +		struct page *page; +		int curr_nid = local_nid; +		int target_nid; +		bool migrated; +		if (!pte_present(pteval)) +			continue; +		if (!pte_numa(pteval)) +			continue; +		if (addr >= vma->vm_end) { +			vma = find_vma(mm, addr); +			/* there's a pte present so there must be a vma */ +			BUG_ON(!vma); +			BUG_ON(addr < vma->vm_start); +		} +		if (pte_numa(pteval)) { +			pteval = pte_mknonnuma(pteval); +			set_pte_at(mm, addr, pte, pteval); +		} +		page = vm_normal_page(vma, addr, pteval); +		if (unlikely(!page)) +			continue; +		/* only check non-shared pages */ +		if (unlikely(page_mapcount(page) != 1)) +			continue; + +		/* +		 * Note that the NUMA fault is later accounted to either +		 * the node that is currently running or where the page is +		 * migrated to. 
+		 */ +		curr_nid = local_nid; +		target_nid = numa_migrate_prep(page, vma, addr, +					       page_to_nid(page)); +		if (target_nid == -1) { +			put_page(page); +			continue; +		} + +		/* Migrate to the requested node */ +		pte_unmap_unlock(pte, ptl); +		migrated = migrate_misplaced_page(page, target_nid); +		if (migrated) +			curr_nid = target_nid; +		task_numa_fault(curr_nid, 1, migrated); + +		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); +	} +	pte_unmap_unlock(orig_pte, ptl); + +	return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +		     unsigned long addr, pmd_t *pmdp) +{ +	BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ +  /*   * These routines also need to handle stuff like marking pages dirty   * and/or accessed for architectures that don't do it in hardware (most @@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,  					pte, pmd, flags, entry);  	} +	if (pte_numa(entry)) +		return do_numa_page(mm, vma, address, entry, pte, pmd); +  	ptl = pte_lockptr(mm, pmd);  	spin_lock(ptl);  	if (unlikely(!pte_same(*pte, entry))) @@ -3520,8 +3704,11 @@ retry:  		if (pmd_trans_huge(orig_pmd)) {  			unsigned int dirty = flags & FAULT_FLAG_WRITE; -			if (dirty && !pmd_write(orig_pmd) && -			    !pmd_trans_splitting(orig_pmd)) { +			if (pmd_numa(orig_pmd)) +				return do_huge_pmd_numa_page(mm, vma, address, +							     orig_pmd, pmd); + +			if (dirty && !pmd_write(orig_pmd)) {  				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,  							  orig_pmd);  				/* @@ -3536,16 +3723,21 @@ retry:  				huge_pmd_set_accessed(mm, vma, address, pmd,  						      orig_pmd, dirty);  			} +  			return 0;  		}  	} +	if (pmd_numa(*pmd)) +		return do_pmd_numa_page(mm, vma, address, pmd); +  	/*  	 * Use __pte_alloc instead of pte_alloc_map, because we can't  	 * run pte_offset_map on the pmd, if an huge pmd could  	 * materialize from under us from a different thread.  	 */ -	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) +	if (unlikely(pmd_none(*pmd)) && +	    unlikely(__pte_alloc(mm, vma, pmd, address)))  		return VM_FAULT_OOM;  	/* if an huge pmd materialized from under us just retry later */  	if (unlikely(pmd_trans_huge(*pmd))) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e8..962e353aa86 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)  		 * migrate_pages returns # of failed pages.  		 
*/  		ret = migrate_pages(&source, alloc_migrate_target, 0, -							true, MIGRATE_SYNC); +							true, MIGRATE_SYNC, +							MR_MEMORY_HOTPLUG);  		if (ret)  			putback_lru_pages(&source);  	} diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6..d1b315e9862 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@  #include <linux/syscalls.h>  #include <linux/ctype.h>  #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h>  #include <asm/tlbflush.h>  #include <asm/uaccess.h> @@ -117,6 +118,26 @@ static struct mempolicy default_policy = {  	.flags = MPOL_F_LOCAL,  }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ +	struct mempolicy *pol = p->mempolicy; +	int node; + +	if (!pol) { +		node = numa_node_id(); +		if (node != -1) +			pol = &preferred_node_policy[node]; + +		/* preferred_node_policy is not initialised early in boot */ +		if (!pol->mode) +			pol = NULL; +	} + +	return pol; +} +  static const struct mempolicy_operations {  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);  	/* @@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  	if (mode == MPOL_DEFAULT) {  		if (nodes && !nodes_empty(*nodes))  			return ERR_PTR(-EINVAL); -		return NULL;	/* simply delete any existing policy */ +		return NULL;  	}  	VM_BUG_ON(!nodes); @@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  			     (flags & MPOL_F_RELATIVE_NODES)))  				return ERR_PTR(-EINVAL);  		} +	} else if (mode == MPOL_LOCAL) { +		if (!nodes_empty(*nodes)) +			return ERR_PTR(-EINVAL); +		mode = MPOL_PREFERRED;  	} else if (nodes_empty(*nodes))  		return ERR_PTR(-EINVAL);  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,  	return 0;  } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. + */ +unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end) +{ +	int nr_updated; +	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); + +	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); +	if (nr_updated) +		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + +	return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end) +{ +	return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +  /*   * Check if all pages in a range are on a set of nodes.   
* If pagelist != NULL then isolate pages from the LRU and @@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,  		return ERR_PTR(-EFAULT);  	prev = NULL;  	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { +		unsigned long endvma = vma->vm_end; + +		if (endvma > end) +			endvma = end; +		if (vma->vm_start > start) +			start = vma->vm_start; +  		if (!(flags & MPOL_MF_DISCONTIG_OK)) {  			if (!vma->vm_next && vma->vm_end < end)  				return ERR_PTR(-EFAULT);  			if (prev && prev->vm_end < vma->vm_start)  				return ERR_PTR(-EFAULT);  		} -		if (!is_vm_hugetlb_page(vma) && -		    ((flags & MPOL_MF_STRICT) || + +		if (is_vm_hugetlb_page(vma)) +			goto next; + +		if (flags & MPOL_MF_LAZY) { +			change_prot_numa(vma, start, endvma); +			goto next; +		} + +		if ((flags & MPOL_MF_STRICT) ||  		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && -				vma_migratable(vma)))) { -			unsigned long endvma = vma->vm_end; +		      vma_migratable(vma))) { -			if (endvma > end) -				endvma = end; -			if (vma->vm_start > start) -				start = vma->vm_start;  			err = check_pgd_range(vma, start, endvma, nodes,  						flags, private);  			if (err) { @@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,  				break;  			}  		} +next:  		prev = vma;  	}  	return first; @@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,  	if (!list_empty(&pagelist)) {  		err = migrate_pages(&pagelist, new_node_page, dest, -							false, MIGRATE_SYNC); +							false, MIGRATE_SYNC, +							MR_SYSCALL);  		if (err)  			putback_lru_pages(&pagelist);  	} @@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,  	int err;  	LIST_HEAD(pagelist); -	if (flags & ~(unsigned long)(MPOL_MF_STRICT | -				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) +	if (flags & ~(unsigned long)MPOL_MF_VALID)  		return -EINVAL;  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))  		return -EPERM; @@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,  	if (IS_ERR(new))  		return PTR_ERR(new); +	if (flags & MPOL_MF_LAZY) +		new->flags |= MPOL_F_MOF; +  	/*  	 * If we are using the default policy then operation  	 * on discontinuous address spaces is okay after all @@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,  	vma = check_range(mm, start, end, nmask,  			  flags | MPOL_MF_INVERT, &pagelist); -	err = PTR_ERR(vma); -	if (!IS_ERR(vma)) { -		int nr_failed = 0; - +	err = PTR_ERR(vma);	/* maybe ... 
*/ +	if (!IS_ERR(vma))  		err = mbind_range(mm, start, end, new); +	if (!err) { +		int nr_failed = 0; +  		if (!list_empty(&pagelist)) { +			WARN_ON_ONCE(flags & MPOL_MF_LAZY);  			nr_failed = migrate_pages(&pagelist, new_vma_page,  						(unsigned long)vma, -						false, MIGRATE_SYNC); +						false, MIGRATE_SYNC, +						MR_MEMPOLICY_MBIND);  			if (nr_failed)  				putback_lru_pages(&pagelist);  		} -		if (!err && nr_failed && (flags & MPOL_MF_STRICT)) +		if (nr_failed && (flags & MPOL_MF_STRICT))  			err = -EIO;  	} else  		putback_lru_pages(&pagelist); @@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,  struct mempolicy *get_vma_policy(struct task_struct *task,  		struct vm_area_struct *vma, unsigned long addr)  { -	struct mempolicy *pol = task->mempolicy; +	struct mempolicy *pol = get_task_policy(task);  	if (vma) {  		if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1956,7 +2028,7 @@ retry_cpuset:   */  struct page *alloc_pages_current(gfp_t gfp, unsigned order)  { -	struct mempolicy *pol = current->mempolicy; +	struct mempolicy *pol = get_task_policy(current);  	struct page *page;  	unsigned int cpuset_mems_cookie; @@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)  	kmem_cache_free(sn_cache, n);  } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page   - page to be checked + * @vma    - vm area where page mapped + * @addr   - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + *	-1	- not misplaced, page is in the right node + *	node	- node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ +	struct mempolicy *pol; +	struct zone *zone; +	int curnid = page_to_nid(page); +	unsigned long pgoff; +	int polnid = -1; +	int ret = -1; + +	BUG_ON(!vma); + +	pol = get_vma_policy(current, vma, addr); +	if (!(pol->flags & MPOL_F_MOF)) +		goto out; + +	switch (pol->mode) { +	case MPOL_INTERLEAVE: +		BUG_ON(addr >= vma->vm_end); +		BUG_ON(addr < vma->vm_start); + +		pgoff = vma->vm_pgoff; +		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; +		polnid = offset_il_node(pol, vma, pgoff); +		break; + +	case MPOL_PREFERRED: +		if (pol->flags & MPOL_F_LOCAL) +			polnid = numa_node_id(); +		else +			polnid = pol->v.preferred_node; +		break; + +	case MPOL_BIND: +		/* +		 * allows binding to multiple nodes. +		 * use current page if in policy nodemask, +		 * else select nearest allowed node, if any. +		 * If no allowed nodes, use current [!misplaced]. +		 */ +		if (node_isset(curnid, pol->v.nodes)) +			goto out; +		(void)first_zones_zonelist( +				node_zonelist(numa_node_id(), GFP_HIGHUSER), +				gfp_zone(GFP_HIGHUSER), +				&pol->v.nodes, &zone); +		polnid = zone->node; +		break; + +	default: +		BUG(); +	} + +	/* Migrate the page towards the node whose CPU is referencing it */ +	if (pol->flags & MPOL_F_MORON) { +		int last_nid; + +		polnid = numa_node_id(); + +		/* +		 * Multi-stage node selection is used in conjunction +		 * with a periodic migration fault to build a temporal +		 * task<->page relation. By using a two-stage filter we +		 * remove short/unlikely relations. 
+		 * +		 * Using P(p) ~ n_p / n_t as per frequentist +		 * probability, we can equate a task's usage of a +		 * particular page (n_p) per total usage of this +		 * page (n_t) (in a given time-span) to a probability. +		 * +		 * Our periodic faults will sample this probability and +		 * getting the same result twice in a row, given these +		 * samples are fully independent, is then given by +		 * P(n)^2, provided our sample period is sufficiently +		 * short compared to the usage pattern. +		 * +		 * This quadric squishes small probabilities, making +		 * it less likely we act on an unlikely task<->page +		 * relation. +		 */ +		last_nid = page_xchg_last_nid(page, polnid); +		if (last_nid != polnid) +			goto out; +	} + +	if (curnid != polnid) +		ret = polnid; +out: +	mpol_cond_put(pol); + +	return ret; +} +  static void sp_delete(struct shared_policy *sp, struct sp_node *n)  {  	pr_debug("deleting %lx-l%lx\n", n->start, n->end); @@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)  	mutex_unlock(&p->mutex);  } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ +	bool numabalancing_default = false; + +	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) +		numabalancing_default = true; + +	if (nr_node_ids > 1 && !numabalancing_override) { +		printk(KERN_INFO "Enabling automatic NUMA balancing. " +			"Configure with numa_balancing= or sysctl"); +		set_numabalancing_state(numabalancing_default); +	} +} + +static int __init setup_numabalancing(char *str) +{ +	int ret = 0; +	if (!str) +		goto out; +	numabalancing_override = true; + +	if (!strcmp(str, "enable")) { +		set_numabalancing_state(true); +		ret = 1; +	} else if (!strcmp(str, "disable")) { +		set_numabalancing_state(false); +		ret = 1; +	} +out: +	if (!ret) +		printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + +	return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ +  /* assumes fs == KERNEL_DS */  void __init numa_policy_init(void)  { @@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)  				     sizeof(struct sp_node),  				     0, SLAB_PANIC, NULL); +	for_each_node(nid) { +		preferred_node_policy[nid] = (struct mempolicy) { +			.refcnt = ATOMIC_INIT(1), +			.mode = MPOL_PREFERRED, +			.flags = MPOL_F_MOF | MPOL_F_MORON, +			.v = { .preferred_node = nid, }, +		}; +	} +  	/*  	 * Set interleaving policy for system init. 
Interleaving is only  	 * enabled across suitably sized nodes (default is >= 16MB), or @@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))  		printk("numa_policy_init: interleaving failed\n"); + +	check_numabalancing_enable();  }  /* Reset policy of current process to default */ @@ -2362,14 +2598,13 @@ void numa_default_policy(void)   * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag   * Used only for mpol_parse_str() and mpol_to_str()   */ -#define MPOL_LOCAL MPOL_MAX  static const char * const policy_modes[] =  {  	[MPOL_DEFAULT]    = "default",  	[MPOL_PREFERRED]  = "prefer",  	[MPOL_BIND]       = "bind",  	[MPOL_INTERLEAVE] = "interleave", -	[MPOL_LOCAL]      = "local" +	[MPOL_LOCAL]      = "local",  }; @@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  	if (flags)  		*flags++ = '\0';	/* terminate mode string */ -	for (mode = 0; mode <= MPOL_LOCAL; mode++) { +	for (mode = 0; mode < MPOL_MAX; mode++) {  		if (!strcmp(str, policy_modes[mode])) {  			break;  		}  	} -	if (mode > MPOL_LOCAL) +	if (mode >= MPOL_MAX)  		goto out;  	switch (mode) { diff --git a/mm/migrate.c b/mm/migrate.c index cae02711181..32efd8028bc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,6 +39,9 @@  #include <asm/tlbflush.h> +#define CREATE_TRACE_POINTS +#include <trace/events/migrate.h> +  #include "internal.h"  /* @@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,  		struct page *newpage, struct page *page,  		struct buffer_head *head, enum migrate_mode mode)  { -	int expected_count; +	int expected_count = 0;  	void **pslot;  	if (!mapping) { @@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,   */  void migrate_page_copy(struct page *newpage, struct page *page)  { -	if (PageHuge(page)) +	if (PageHuge(page) || PageTransHuge(page))  		copy_huge_page(newpage, page);  	else  		copy_highpage(newpage, page); @@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,  	 */  	if (PageAnon(page)) {  		/* -		 * Only page_lock_anon_vma() understands the subtleties of +		 * Only page_lock_anon_vma_read() understands the subtleties of  		 * getting a hold on an anon_vma from outside one of its mms.  		 
*/  		anon_vma = page_get_anon_vma(page); @@ -998,10 +1001,11 @@ out:   */  int migrate_pages(struct list_head *from,  		new_page_t get_new_page, unsigned long private, bool offlining, -		enum migrate_mode mode) +		enum migrate_mode mode, int reason)  {  	int retry = 1;  	int nr_failed = 0; +	int nr_succeeded = 0;  	int pass = 0;  	struct page *page;  	struct page *page2; @@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,  				retry++;  				break;  			case MIGRATEPAGE_SUCCESS: +				nr_succeeded++;  				break;  			default:  				/* Permanent failure */ @@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from,  	}  	rc = nr_failed + retry;  out: +	if (nr_succeeded) +		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); +	if (nr_failed) +		count_vm_events(PGMIGRATE_FAIL, nr_failed); +	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); +  	if (!swapwrite)  		current->flags &= ~PF_SWAPWRITE; @@ -1176,7 +1187,8 @@ set_status:  	err = 0;  	if (!list_empty(&pagelist)) {  		err = migrate_pages(&pagelist, new_page_node, -				(unsigned long)pm, 0, MIGRATE_SYNC); +				(unsigned long)pm, 0, MIGRATE_SYNC, +				MR_SYSCALL);  		if (err)  			putback_lru_pages(&pagelist);  	} @@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,   	}   	return err;  } -#endif + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns true if this is a safe migration target node for misplaced NUMA + * pages. Currently it only checks the watermarks which crude + */ +static bool migrate_balanced_pgdat(struct pglist_data *pgdat, +				   int nr_migrate_pages) +{ +	int z; +	for (z = pgdat->nr_zones - 1; z >= 0; z--) { +		struct zone *zone = pgdat->node_zones + z; + +		if (!populated_zone(zone)) +			continue; + +		if (zone->all_unreclaimable) +			continue; + +		/* Avoid waking kswapd by allocating pages_to_migrate pages. */ +		if (!zone_watermark_ok(zone, 0, +				       high_wmark_pages(zone) + +				       nr_migrate_pages, +				       0, 0)) +			continue; +		return true; +	} +	return false; +} + +static struct page *alloc_misplaced_dst_page(struct page *page, +					   unsigned long data, +					   int **result) +{ +	int nid = (int) data; +	struct page *newpage; + +	newpage = alloc_pages_exact_node(nid, +					 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | +					  __GFP_NOMEMALLOC | __GFP_NORETRY | +					  __GFP_NOWARN) & +					 ~GFP_IOFS, 0); +	if (newpage) +		page_xchg_last_nid(newpage, page_last_nid(page)); + +	return newpage; +} + +/* + * page migration rate limiting control. + * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs + * window of time. Default here says do not migrate more than 1280M per second. + * If a node is rate-limited then PTE NUMA updates are also rate-limited. However + * as it is faults that reset the window, pte updates will happen unconditionally + * if there has not been a fault since @pteupdate_interval_millisecs after the + * throttle window closed. 
+ */ +static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; +static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); + +/* Returns true if NUMA migration is currently rate limited */ +bool migrate_ratelimited(int node) +{ +	pg_data_t *pgdat = NODE_DATA(node); + +	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + +				msecs_to_jiffies(pteupdate_interval_millisecs))) +		return false; + +	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) +		return false; + +	return true; +} + +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +{ +	bool rate_limited = false; + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! +	 */ +	spin_lock(&pgdat->numabalancing_migrate_lock); +	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { +		pgdat->numabalancing_migrate_nr_pages = 0; +		pgdat->numabalancing_migrate_next_window = jiffies + +			msecs_to_jiffies(migrate_interval_millisecs); +	} +	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) +		rate_limited = true; +	else +		pgdat->numabalancing_migrate_nr_pages += nr_pages; +	spin_unlock(&pgdat->numabalancing_migrate_lock); +	 +	return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ +	int ret = 0; + +	/* Avoid migrating to a node that is nearly full */ +	if (migrate_balanced_pgdat(pgdat, 1)) { +		int page_lru; + +		if (isolate_lru_page(page)) { +			put_page(page); +			return 0; +		} + +		/* Page is isolated */ +		ret = 1; +		page_lru = page_is_file_cache(page); +		if (!PageTransHuge(page)) +			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); +		else +			mod_zone_page_state(page_zone(page), +					NR_ISOLATED_ANON + page_lru, +					HPAGE_PMD_NR); +	} + +	/* +	 * Page is either isolated or there is not enough space on the target +	 * node. If isolated, then it has taken a reference count and the +	 * callers reference can be safely dropped without the page +	 * disappearing underneath us during migration. Otherwise the page is +	 * not to be migrated but the callers reference should still be +	 * dropped so it does not leak. +	 */ +	put_page(page); + +	return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. + */ +int migrate_misplaced_page(struct page *page, int node) +{ +	pg_data_t *pgdat = NODE_DATA(node); +	int isolated = 0; +	int nr_remaining; +	LIST_HEAD(migratepages); + +	/* +	 * Don't migrate pages that are mapped in multiple processes. +	 * TODO: Handle false sharing detection instead of this hammer +	 */ +	if (page_mapcount(page) != 1) { +		put_page(page); +		goto out; +	} + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! 
+	 */ +	if (numamigrate_update_ratelimit(pgdat, 1)) { +		put_page(page); +		goto out; +	} + +	isolated = numamigrate_isolate_page(pgdat, page); +	if (!isolated) +		goto out; + +	list_add(&page->lru, &migratepages); +	nr_remaining = migrate_pages(&migratepages, +			alloc_misplaced_dst_page, +			node, false, MIGRATE_ASYNC, +			MR_NUMA_MISPLACED); +	if (nr_remaining) { +		putback_lru_pages(&migratepages); +		isolated = 0; +	} else +		count_vm_numa_event(NUMA_PAGE_MIGRATE); +	BUG_ON(!list_empty(&migratepages)); +out: +	return isolated; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +int migrate_misplaced_transhuge_page(struct mm_struct *mm, +				struct vm_area_struct *vma, +				pmd_t *pmd, pmd_t entry, +				unsigned long address, +				struct page *page, int node) +{ +	unsigned long haddr = address & HPAGE_PMD_MASK; +	pg_data_t *pgdat = NODE_DATA(node); +	int isolated = 0; +	struct page *new_page = NULL; +	struct mem_cgroup *memcg = NULL; +	int page_lru = page_is_file_cache(page); + +	/* +	 * Don't migrate pages that are mapped in multiple processes. +	 * TODO: Handle false sharing detection instead of this hammer +	 */ +	if (page_mapcount(page) != 1) +		goto out_dropref; + +	/* +	 * Rate-limit the amount of data that is being migrated to a node. +	 * Optimal placement is no good if the memory bus is saturated and +	 * all the time is being spent migrating! +	 */ +	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) +		goto out_dropref; + +	new_page = alloc_pages_node(node, +		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); +	if (!new_page) { +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		goto out_dropref; +	} +	page_xchg_last_nid(new_page, page_last_nid(page)); + +	isolated = numamigrate_isolate_page(pgdat, page); +	if (!isolated) { +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		put_page(new_page); +		goto out_keep_locked; +	} + +	/* Prepare a page as a migration target */ +	__set_page_locked(new_page); +	SetPageSwapBacked(new_page); + +	/* anon mapping, we can simply copy page->mapping to the new page: */ +	new_page->mapping = page->mapping; +	new_page->index = page->index; +	migrate_page_copy(new_page, page); +	WARN_ON(PageLRU(new_page)); + +	/* Recheck the target PMD */ +	spin_lock(&mm->page_table_lock); +	if (unlikely(!pmd_same(*pmd, entry))) { +		spin_unlock(&mm->page_table_lock); + +		/* Reverse changes made by migrate_page_copy() */ +		if (TestClearPageActive(new_page)) +			SetPageActive(page); +		if (TestClearPageUnevictable(new_page)) +			SetPageUnevictable(page); +		mlock_migrate_page(page, new_page); + +		unlock_page(new_page); +		put_page(new_page);		/* Free it */ + +		unlock_page(page); +		putback_lru_page(page); + +		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +		goto out; +	} + +	/* +	 * Traditional migration needs to prepare the memcg charge +	 * transaction early to prevent the old page from being +	 * uncharged when installing migration entries.  Here we can +	 * save the potential rollback and start the charge transfer +	 * only when migration is already known to end successfully. 
+	 */ +	mem_cgroup_prepare_migration(page, new_page, &memcg); + +	entry = mk_pmd(new_page, vma->vm_page_prot); +	entry = pmd_mknonnuma(entry); +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); +	entry = pmd_mkhuge(entry); + +	page_add_new_anon_rmap(new_page, vma, haddr); + +	set_pmd_at(mm, haddr, pmd, entry); +	update_mmu_cache_pmd(vma, address, entry); +	page_remove_rmap(page); +	/* +	 * Finish the charge transaction under the page table lock to +	 * prevent split_huge_page() from dividing up the charge +	 * before it's fully transferred to the new page. +	 */ +	mem_cgroup_end_migration(memcg, page, new_page, true); +	spin_unlock(&mm->page_table_lock); + +	unlock_page(new_page); +	unlock_page(page); +	put_page(page);			/* Drop the rmap reference */ +	put_page(page);			/* Drop the LRU isolation reference */ + +	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); +	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: +	mod_zone_page_state(page_zone(page), +			NR_ISOLATED_ANON + page_lru, +			-HPAGE_PMD_NR); +	return isolated; + +out_dropref: +	put_page(page); +out_keep_locked: +	return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ diff --git a/mm/mmap.c b/mm/mmap.c index 2b7d9e78a56..f54b235f29a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -736,7 +736,7 @@ again:			remove_next = 1 + (end > next->vm_end);  	if (anon_vma) {  		VM_BUG_ON(adjust_next && next->anon_vma &&  			  anon_vma != next->anon_vma); -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		anon_vma_interval_tree_pre_update_vma(vma);  		if (adjust_next)  			anon_vma_interval_tree_pre_update_vma(next); @@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)  		 * The LSB of head.next can't change from under us  		 * because we hold the mm_all_locks_mutex.  		 */ -		mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); +		down_write(&anon_vma->root->rwsem);  		/*  		 * We can safely modify head.next after taking the -		 * anon_vma->root->mutex. If some other vma in this mm shares +		 * anon_vma->root->rwsem. If some other vma in this mm shares  		 * the same anon_vma we won't take it again.  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us thanks to the -		 * anon_vma->root->mutex. +		 * anon_vma->root->rwsem.  		 */  		if (__test_and_set_bit(0, (unsigned long *)  				       &anon_vma->root->rb_root.rb_node)) @@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)  		 *  		 * No need of atomic instructions here, head.next  		 * can't change from under us until we release the -		 * anon_vma->root->mutex. +		 * anon_vma->root->rwsem.  		 
*/  		if (!__test_and_clear_bit(0, (unsigned long *)  					  &anon_vma->root->rb_root.rb_node)) diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6f..3dca970367d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)  }  #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa, bool *ret_all_same_node)  { +	struct mm_struct *mm = vma->vm_mm;  	pte_t *pte, oldpte;  	spinlock_t *ptl; +	unsigned long pages = 0; +	bool all_same_node = true; +	int last_nid = -1;  	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);  	arch_enter_lazy_mmu_mode(); @@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,  		oldpte = *pte;  		if (pte_present(oldpte)) {  			pte_t ptent; +			bool updated = false;  			ptent = ptep_modify_prot_start(mm, addr, pte); -			ptent = pte_modify(ptent, newprot); +			if (!prot_numa) { +				ptent = pte_modify(ptent, newprot); +				updated = true; +			} else { +				struct page *page; + +				page = vm_normal_page(vma, addr, oldpte); +				if (page) { +					int this_nid = page_to_nid(page); +					if (last_nid == -1) +						last_nid = this_nid; +					if (last_nid != this_nid) +						all_same_node = false; + +					/* only check non-shared pages */ +					if (!pte_numa(oldpte) && +					    page_mapcount(page) == 1) { +						ptent = pte_mknuma(ptent); +						updated = true; +					} +				} +			}  			/*  			 * Avoid taking write faults for pages we know to be  			 * dirty.  			 */ -			if (dirty_accountable && pte_dirty(ptent)) +			if (dirty_accountable && pte_dirty(ptent)) {  				ptent = pte_mkwrite(ptent); +				updated = true; +			} +			if (updated) +				pages++;  			ptep_modify_prot_commit(mm, addr, pte, ptent);  		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {  			swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,  				set_pte_at(mm, addr, pte,  					swp_entry_to_pte(entry));  			} +			pages++;  		}  	} while (pte++, addr += PAGE_SIZE, addr != end);  	arch_leave_lazy_mmu_mode();  	pte_unmap_unlock(pte - 1, ptl); + +	*ret_all_same_node = all_same_node; +	return pages; +} + +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, +		pmd_t *pmd) +{ +	spin_lock(&mm->page_table_lock); +	set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); +	spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, +		pmd_t *pmd) +{ +	BUG();  } +#endif /* CONFIG_NUMA_BALANCING */ -static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	pmd_t *pmd;  	unsigned long next; +	unsigned long pages = 0; +	bool all_same_node;  	pmd = pmd_offset(pud, addr);  	do { @@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,  		if (pmd_trans_huge(*pmd)) {  			if (next - addr != HPAGE_PMD_SIZE)  				split_huge_page_pmd(vma, addr, pmd); -			else if (change_huge_pmd(vma, pmd, addr, newprot)) +			else if 
(change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { +				pages += HPAGE_PMD_NR;  				continue; +			}  			/* fall through */  		}  		if (pmd_none_or_clear_bad(pmd))  			continue; -		change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -				 dirty_accountable); +		pages += change_pte_range(vma, pmd, addr, next, newprot, +				 dirty_accountable, prot_numa, &all_same_node); + +		/* +		 * If we are changing protections for NUMA hinting faults then +		 * set pmd_numa if the examined pages were all on the same +		 * node. This allows a regular PMD to be handled as one fault +		 * and effectively batches the taking of the PTL +		 */ +		if (prot_numa && all_same_node) +			change_pmd_protnuma(vma->vm_mm, addr, pmd);  	} while (pmd++, addr = next, addr != end); + +	return pages;  } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	pud_t *pud;  	unsigned long next; +	unsigned long pages = 0;  	pud = pud_offset(pgd, addr);  	do {  		next = pud_addr_end(addr, end);  		if (pud_none_or_clear_bad(pud))  			continue; -		change_pmd_range(vma, pud, addr, next, newprot, -				 dirty_accountable); +		pages += change_pmd_range(vma, pud, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pud++, addr = next, addr != end); + +	return pages;  } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	struct mm_struct *mm = vma->vm_mm;  	pgd_t *pgd;  	unsigned long next;  	unsigned long start = addr; +	unsigned long pages = 0;  	BUG_ON(addr >= end);  	pgd = pgd_offset(mm, addr); @@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,  		next = pgd_addr_end(addr, end);  		if (pgd_none_or_clear_bad(pgd))  			continue; -		change_pud_range(vma, pgd, addr, next, newprot, -				 dirty_accountable); +		pages += change_pud_range(vma, pgd, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pgd++, addr = next, addr != end); -	flush_tlb_range(vma, start, end); + +	/* Only flush the TLB if we actually modified any entries: */ +	if (pages) +		flush_tlb_range(vma, start, end); + +	return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +		       unsigned long end, pgprot_t newprot, +		       int dirty_accountable, int prot_numa) +{ +	struct mm_struct *mm = vma->vm_mm; +	unsigned long pages; + +	mmu_notifier_invalidate_range_start(mm, start, end); +	if (is_vm_hugetlb_page(vma)) +		pages = hugetlb_change_protection(vma, start, end, newprot); +	else +		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); +	mmu_notifier_invalidate_range_end(mm, start, end); + +	return pages;  }  int @@ -213,12 +304,8 @@ success:  		dirty_accountable = 1;  	} -	mmu_notifier_invalidate_range_start(mm, start, end); -	if (is_vm_hugetlb_page(vma)) -		hugetlb_change_protection(vma, start, end, vma->vm_page_prot); -	else -		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); -	mmu_notifier_invalidate_range_end(mm, start, end); +	change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); +  	vm_stat_account(mm, oldflags, 
vma->vm_file, -nrpages);  	vm_stat_account(mm, newflags, vma->vm_file, nrpages);  	perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9..e1031e1f6a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  		}  		if (vma->anon_vma) {  			anon_vma = vma->anon_vma; -			anon_vma_lock(anon_vma); +			anon_vma_lock_write(anon_vma);  		}  	} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba11..d037c8bc151 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page)  		bad_page(page);  		return 1;  	} +	reset_page_last_nid(page);  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;  	return 0; @@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		mminit_verify_page_links(page, zone, nid, pfn);  		init_page_count(page);  		reset_page_mapcount(page); +		reset_page_last_nid(page);  		SetPageReserved(page);  		/*  		 * Mark the block movable so that blocks are reserved for @@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,  	int ret;  	pgdat_resize_init(pgdat); +#ifdef CONFIG_NUMA_BALANCING +	spin_lock_init(&pgdat->numabalancing_migrate_lock); +	pgdat->numabalancing_migrate_nr_pages = 0; +	pgdat->numabalancing_migrate_next_window = jiffies; +#endif  	init_waitqueue_head(&pgdat->kswapd_wait);  	init_waitqueue_head(&pgdat->pfmemalloc_wait);  	pgdat_page_cgroup_init(pgdat); @@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,  		ret = migrate_pages(&cc->migratepages,  				    alloc_migrate_target, -				    0, false, MIGRATE_SYNC); +				    0, false, MIGRATE_SYNC, +				    MR_CMA);  	}  	putback_movable_pages(&cc->migratepages); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b..0c8323fe6c8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -12,8 +12,8 @@  #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS  /* - * Only sets the access flags (dirty, accessed, and - * writable). Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed), as well as write  + * permission. Furthermore, we know it always gets set to a "more   * permissive" setting, which allows most architectures to optimize   * this. We return whether the PTE actually changed, which in turn   * instructs the caller to do things like update__mmu_cache.  
This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,  	int changed = !pte_same(*ptep, entry);  	if (changed) {  		set_pte_at(vma->vm_mm, address, ptep, entry); -		flush_tlb_page(vma, address); +		flush_tlb_fix_spurious_fault(vma, address);  	}  	return changed;  } @@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,  {  	pte_t pte;  	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); -	flush_tlb_page(vma, address); +	if (pte_accessible(pte)) +		flush_tlb_page(vma, address);  	return pte;  }  #endif diff --git a/mm/rmap.c b/mm/rmap.c index face808a489..2c78f8cadc9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@   *   mm->mmap_sem   *     page->flags PG_locked (lock_page)   *       mapping->i_mmap_mutex - *         anon_vma->mutex + *         anon_vma->rwsem   *           mm->page_table_lock or pte_lock   *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)   *             swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@   *                           in arch-dependent flush_dcache_mmap_lock,   *                           within bdi.wb->list_lock in __sync_single_inode)   * - * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)   *   ->tasklist_lock   *     pte map lock   */ @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)  	VM_BUG_ON(atomic_read(&anon_vma->refcount));  	/* -	 * Synchronize against page_lock_anon_vma() such that +	 * Synchronize against page_lock_anon_vma_read() such that  	 * we can safely hold the lock without the anon_vma getting  	 * freed.  	 *  	 * Relies on the full mb implied by the atomic_dec_and_test() from  	 * put_anon_vma() against the acquire barrier implied by -	 * mutex_trylock() from page_lock_anon_vma(). This orders: +	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:  	 * -	 * page_lock_anon_vma()		VS	put_anon_vma() -	 *   mutex_trylock()			  atomic_dec_and_test() +	 * page_lock_anon_vma_read()	VS	put_anon_vma() +	 *   down_read_trylock()		  atomic_dec_and_test()  	 *   LOCK				  MB -	 *   atomic_read()			  mutex_is_locked() +	 *   atomic_read()			  rwsem_is_locked()  	 *  	 * LOCK should suffice since the actual taking of the lock must  	 * happen _before_ what follows.  	 */ -	if (mutex_is_locked(&anon_vma->root->mutex)) { -		anon_vma_lock(anon_vma); +	if (rwsem_is_locked(&anon_vma->root->rwsem)) { +		anon_vma_lock_write(anon_vma);  		anon_vma_unlock(anon_vma);  	} @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,   * allocate a new one.   *   * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read()   * and that may actually touch the spinlock even in the newly   * allocated vma (it depends on RCU to make sure that the   * anon_vma isn't actually destroyed). 
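The rmap.c hunks around this point convert the exclusive anon_vma mutex into an rwsem, so that rmap walkers such as page_lock_anon_vma_read() and rmap_walk_anon() can share the lock while vma mutators still take it exclusively through anon_vma_lock_write(). As a rough userspace analogy only (pthread rwlocks stand in for the kernel rwsem; none of this code is part of the patch, and the thread names are made up), the intended reader/writer split looks like this:

/* rwsem_analogy.c - userspace sketch of the reader/writer behaviour the
 * anon_vma mutex -> rwsem conversion is aiming for.
 * Build: cc -pthread -o rwsem_analogy rwsem_analogy.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int interval_tree_size;		/* stands in for anon_vma->rb_root */

static void *rmap_walker(void *arg)	/* think try_to_unmap_anon() */
{
	pthread_rwlock_rdlock(&lock);	/* like anon_vma_lock_read()  */
	printf("walker %ld sees %d vmas\n", (long)arg, interval_tree_size);
	pthread_rwlock_unlock(&lock);	/* like anon_vma_unlock_read() */
	return NULL;
}

static void *vma_mutator(void *arg)	/* think vma_adjust() */
{
	pthread_rwlock_wrlock(&lock);	/* like anon_vma_lock_write() */
	interval_tree_size++;		/* modify the interval tree exclusively */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	int i;

	/* One exclusive writer, two readers that may run concurrently. */
	pthread_create(&t[0], NULL, vma_mutator, NULL);
	pthread_create(&t[1], NULL, rmap_walker, (void *)1L);
	pthread_create(&t[2], NULL, rmap_walker, (void *)2L);
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}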
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)  			allocated = anon_vma;  		} -		anon_vma_lock(anon_vma); +		anon_vma_lock_write(anon_vma);  		/* page_table_lock to protect against threads */  		spin_lock(&mm->page_table_lock);  		if (likely(!vma->anon_vma)) { @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct  	struct anon_vma *new_root = anon_vma->root;  	if (new_root != root) {  		if (WARN_ON_ONCE(root)) -			mutex_unlock(&root->mutex); +			up_write(&root->rwsem);  		root = new_root; -		mutex_lock(&root->mutex); +		down_write(&root->rwsem);  	}  	return root;  } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct  static inline void unlock_anon_vma_root(struct anon_vma *root)  {  	if (root) -		mutex_unlock(&root->mutex); +		up_write(&root->rwsem);  }  /* @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)  	get_anon_vma(anon_vma->root);  	/* Mark this anon_vma as the one where our new (COWed) pages go. */  	vma->anon_vma = anon_vma; -	anon_vma_lock(anon_vma); +	anon_vma_lock_write(anon_vma);  	anon_vma_chain_link(vma, avc, anon_vma);  	anon_vma_unlock(anon_vma); @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)  	/*  	 * Iterate the list once more, it now only contains empty and unlinked  	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() -	 * needing to acquire the anon_vma->root->mutex. +	 * needing to write-acquire the anon_vma->root->rwsem.  	 */  	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {  		struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)  {  	struct anon_vma *anon_vma = data; -	mutex_init(&anon_vma->mutex); +	init_rwsem(&anon_vma->rwsem);  	atomic_set(&anon_vma->refcount, 0);  	anon_vma->rb_root = RB_ROOT;  } @@ -442,7 +442,7 @@ out:   * atomic op -- the trylock. If we fail the trylock, we fall back to getting a   * reference like with page_get_anon_vma() and then block on the mutex.   */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page)  {  	struct anon_vma *anon_vma = NULL;  	struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)  	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);  	root_anon_vma = ACCESS_ONCE(anon_vma->root); -	if (mutex_trylock(&root_anon_vma->mutex)) { +	if (down_read_trylock(&root_anon_vma->rwsem)) {  		/*  		 * If the page is still mapped, then this anon_vma is still  		 * its anon_vma, and holding the mutex ensures that it will  		 * not go away, see anon_vma_free().  		 */  		if (!page_mapped(page)) { -			mutex_unlock(&root_anon_vma->mutex); +			up_read(&root_anon_vma->rwsem);  			anon_vma = NULL;  		}  		goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)  	/* we pinned the anon_vma, its safe to sleep */  	rcu_read_unlock(); -	anon_vma_lock(anon_vma); +	anon_vma_lock_read(anon_vma);  	if (atomic_dec_and_test(&anon_vma->refcount)) {  		/*  		 * Oops, we held the last refcount, release the lock  		 * and bail -- can't simply use put_anon_vma() because -		 * we'll deadlock on the anon_vma_lock() recursion. +		 * we'll deadlock on the anon_vma_lock_write() recursion.  		 
*/ -		anon_vma_unlock(anon_vma); +		anon_vma_unlock_read(anon_vma);  		__put_anon_vma(anon_vma);  		anon_vma = NULL;  	} @@ -504,9 +504,9 @@ out:  	return anon_vma;  } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma)  { -	anon_vma_unlock(anon_vma); +	anon_vma_unlock_read(anon_vma);  }  /* @@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,  	struct anon_vma_chain *avc;  	int referenced = 0; -	anon_vma = page_lock_anon_vma(page); +	anon_vma = page_lock_anon_vma_read(page);  	if (!anon_vma)  		return referenced; @@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,  			break;  	} -	page_unlock_anon_vma(anon_vma); +	page_unlock_anon_vma_read(anon_vma);  	return referenced;  } @@ -1315,7 +1315,7 @@ out_mlock:  	/*  	 * We need mmap_sem locking, Otherwise VM_LOCKED check makes  	 * unstable result and race. Plus, We can't wait here because -	 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. +	 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.  	 * if trylock failed, the page remain in evictable lru and later  	 * vmscan could retry to move the page to unevictable lru if the  	 * page is actually mlocked. @@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)  	struct anon_vma_chain *avc;  	int ret = SWAP_AGAIN; -	anon_vma = page_lock_anon_vma(page); +	anon_vma = page_lock_anon_vma_read(page);  	if (!anon_vma)  		return ret; @@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)  			break;  	} -	page_unlock_anon_vma(anon_vma); +	page_unlock_anon_vma_read(anon_vma);  	return ret;  } @@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,  	int ret = SWAP_AGAIN;  	/* -	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() +	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()  	 * because that depends on page_mapped(); but not all its usages  	 * are holding mmap_sem. Users without mmap_sem are required to  	 * take a reference count to prevent the anon_vma disappearing @@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,  	anon_vma = page_anon_vma(page);  	if (!anon_vma)  		return ret; -	anon_vma_lock(anon_vma); +	anon_vma_lock_read(anon_vma);  	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {  		struct vm_area_struct *vma = avc->vma;  		unsigned long address = vma_address(page, vma); @@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,  		if (ret != SWAP_AGAIN)  			break;  	} -	anon_vma_unlock(anon_vma); +	anon_vma_unlock_read(anon_vma);  	return ret;  } diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a3..9800306c819 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,10 +774,20 @@ const char * const vmstat_text[] = {  	"pgrotated", +#ifdef CONFIG_NUMA_BALANCING +	"numa_pte_updates", +	"numa_hint_faults", +	"numa_hint_faults_local", +	"numa_pages_migrated", +#endif +#ifdef CONFIG_MIGRATION +	"pgmigrate_success", +	"pgmigrate_fail", +#endif  #ifdef CONFIG_COMPACTION -	"compact_blocks_moved", -	"compact_pages_moved", -	"compact_pagemigrate_failed", +	"compact_migrate_scanned", +	"compact_free_scanned", +	"compact_isolated",  	"compact_stall",  	"compact_fail",  	"compact_success",  |
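The vmstat_text additions in the final hunk surface the new NUMA-balancing and migration counters through /proc/vmstat on kernels built with CONFIG_NUMA_BALANCING and CONFIG_MIGRATION. A minimal userspace sketch for sampling them (not part of this series; it simply assumes the counter names above appear verbatim in /proc/vmstat and skips any that are absent):

/* numastat.c - print the counters added by this series.
 * Build: cc -o numastat numastat.c
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char *keys[] = {
		"numa_pte_updates", "numa_hint_faults",
		"numa_hint_faults_local", "numa_pages_migrated",
		"pgmigrate_success", "pgmigrate_fail",
	};
	char name[64];
	unsigned long long val;
	size_t i;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* /proc/vmstat is a list of "name value" pairs, one per line. */
	while (fscanf(f, "%63s %llu", name, &val) == 2)
		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strcmp(name, keys[i]))
				printf("%-24s %llu\n", name, val);
	fclose(f);
	return 0;
}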