Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 3 |
| -rw-r--r-- | mm/Makefile | 3 |
| -rw-r--r-- | mm/backing-dev.c | 5 |
| -rw-r--r-- | mm/bounce.c | 9 |
| -rw-r--r-- | mm/compaction.c | 26 |
| -rw-r--r-- | mm/debug-pagealloc.c | 56 |
| -rw-r--r-- | mm/filemap.c | 3 |
| -rw-r--r-- | mm/highmem.c | 2 |
| -rw-r--r-- | mm/huge_memory.c | 91 |
| -rw-r--r-- | mm/internal.h | 46 |
| -rw-r--r-- | mm/ksm.c | 3 |
| -rw-r--r-- | mm/memblock.c | 11 |
| -rw-r--r-- | mm/memcontrol.c | 1009 |
| -rw-r--r-- | mm/memory-failure.c | 12 |
| -rw-r--r-- | mm/memory.c | 2 |
| -rw-r--r-- | mm/mempolicy.c | 2 |
| -rw-r--r-- | mm/migrate.c | 83 |
| -rw-r--r-- | mm/mlock.c | 13 |
| -rw-r--r-- | mm/mmap.c | 9 |
| -rw-r--r-- | mm/mremap.c | 42 |
| -rw-r--r-- | mm/oom_kill.c | 53 |
| -rw-r--r-- | mm/page-writeback.c | 4 |
| -rw-r--r-- | mm/page_alloc.c | 17 |
| -rw-r--r-- | mm/page_cgroup.c | 12 |
| -rw-r--r-- | mm/process_vm_access.c | 496 |
| -rw-r--r-- | mm/rmap.c | 2 |
| -rw-r--r-- | mm/shmem.c | 12 |
| -rw-r--r-- | mm/slab.c | 19 |
| -rw-r--r-- | mm/slub.c | 605 |
| -rw-r--r-- | mm/swap.c | 83 |
| -rw-r--r-- | mm/swapfile.c | 2 |
| -rw-r--r-- | mm/thrash.c | 2 |
| -rw-r--r-- | mm/vmalloc.c | 80 |
| -rw-r--r-- | mm/vmscan.c | 331 |
| -rw-r--r-- | mm/vmstat.c | 3 |
35 files changed, 2037 insertions(+), 1114 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index f2f1ca19ed5..011b110365c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP  config HAVE_MEMBLOCK  	boolean +config NO_BOOTMEM +	boolean +  # eventually, we can have this option just 'select SPARSEMEM'  config MEMORY_HOTPLUG  	bool "Allow for memory hot-add" diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1b..50ec00ef2a0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,8 @@  mmu-y			:= nommu.o  mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \  			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ -			   vmalloc.o pagewalk.o pgtable-generic.o +			   vmalloc.o pagewalk.o pgtable-generic.o \ +			   process_vm_access.o  obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \  			   maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8916feb05..a0860640378 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -406,9 +406,8 @@ static int bdi_forker_thread(void *ptr)  		/*  		 * In the following loop we are going to check whether we have  		 * some work to do without any synchronization with tasks -		 * waking us up to do work for them. So we have to set task -		 * state already here so that we don't miss wakeups coming -		 * after we verify some condition. +		 * waking us up to do work for them. Set the task state here +		 * so that we don't miss wakeups after verifying conditions.  		 */  		set_current_state(TASK_INTERRUPTIBLE); diff --git a/mm/bounce.c b/mm/bounce.c index 1481de68184..434fb4f0c5e 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@  #include <linux/init.h>  #include <linux/hash.h>  #include <linux/highmem.h> +#include <linux/bootmem.h>  #include <asm/tlbflush.h>  #include <trace/events/block.h> @@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;  #ifdef CONFIG_HIGHMEM  static __init int init_emergency_pool(void)  { -	struct sysinfo i; -	si_meminfo(&i); -	si_swapinfo(&i); - -	if (!i.totalhigh) +#ifndef CONFIG_MEMORY_HOTPLUG +	if (max_pfn <= max_low_pfn)  		return 0; +#endif  	page_pool = mempool_create_page_pool(POOL_SIZE, 0);  	BUG_ON(!page_pool); diff --git a/mm/compaction.c b/mm/compaction.c index 6cc604bd564..899d9563858 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,10 +35,6 @@ struct compact_control {  	unsigned long migrate_pfn;	/* isolate_migratepages search base */  	bool sync;			/* Synchronous migration */ -	/* Account for isolated anon and file pages */ -	unsigned long nr_anon; -	unsigned long nr_file; -  	unsigned int order;		/* order a direct compactor needs */  	int migratetype;		/* MOVABLE, RECLAIMABLE etc */  	struct zone *zone; @@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,  static void acct_isolated(struct zone *zone, struct compact_control *cc)  {  	struct page *page; -	unsigned int count[NR_LRU_LISTS] = { 0, }; +	unsigned int count[2] = { 0, }; -	list_for_each_entry(page, &cc->migratepages, lru) { -		int lru = page_lru_base_type(page); -		count[lru]++; -	} +	list_for_each_entry(page, &cc->migratepages, lru) +		count[!!page_is_file_cache(page)]++; -	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; -	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; -	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); -	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); +	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); +	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);  }  /* Similar to reclaim, 
but different enough that they don't share logic */ @@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,  	unsigned long last_pageblock_nr = 0, pageblock_nr;  	unsigned long nr_scanned = 0, nr_isolated = 0;  	struct list_head *migratelist = &cc->migratepages; +	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;  	/* Do not scan outside zone boundaries */  	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); @@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,  			continue;  		} +		if (!cc->sync) +			mode |= ISOLATE_CLEAN; +  		/* Try isolate the page */ -		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) +		if (__isolate_lru_page(page, mode, 0) != 0)  			continue;  		VM_BUG_ON(PageTransCompound(page)); @@ -586,7 +582,7 @@ out:  	return ret;  } -unsigned long compact_zone_order(struct zone *zone, +static unsigned long compact_zone_order(struct zone *zone,  				 int order, gfp_t gfp_mask,  				 bool sync)  { diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index a1e3324de2b..7cea557407f 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -1,7 +1,10 @@  #include <linux/kernel.h> +#include <linux/string.h>  #include <linux/mm.h> +#include <linux/highmem.h>  #include <linux/page-debug-flags.h>  #include <linux/poison.h> +#include <linux/ratelimit.h>  static inline void set_page_poison(struct page *page)  { @@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)  	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);  } -static void poison_highpage(struct page *page) -{ -	/* -	 * Page poisoning for highmem pages is not implemented. -	 * -	 * This can be called from interrupt contexts. -	 * So we need to create a new kmap_atomic slot for this -	 * application and it will need interrupt protection. -	 */ -} -  static void poison_page(struct page *page)  { -	void *addr; +	void *addr = kmap_atomic(page); -	if (PageHighMem(page)) { -		poison_highpage(page); -		return; -	}  	set_page_poison(page); -	addr = page_address(page);  	memset(addr, PAGE_POISON, PAGE_SIZE); +	kunmap_atomic(addr);  }  static void poison_pages(struct page *page, int n) @@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)  static void check_poison_mem(unsigned char *mem, size_t bytes)  { +	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);  	unsigned char *start;  	unsigned char *end; -	for (start = mem; start < mem + bytes; start++) { -		if (*start != PAGE_POISON) -			break; -	} -	if (start == mem + bytes) +	start = memchr_inv(mem, PAGE_POISON, bytes); +	if (!start)  		return;  	for (end = mem + bytes - 1; end > start; end--) { @@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)  			break;  	} -	if (!printk_ratelimit()) +	if (!__ratelimit(&ratelimit))  		return;  	else if (start == end && single_bit_flip(*start, PAGE_POISON))  		printk(KERN_ERR "pagealloc: single bit error\n"); @@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)  	dump_stack();  } -static void unpoison_highpage(struct page *page) -{ -	/* -	 * See comment in poison_highpage(). 
-	 * Highmem pages should not be poisoned for now -	 */ -	BUG_ON(page_poison(page)); -} -  static void unpoison_page(struct page *page)  { -	if (PageHighMem(page)) { -		unpoison_highpage(page); +	void *addr; + +	if (!page_poison(page))  		return; -	} -	if (page_poison(page)) { -		void *addr = page_address(page); -		check_poison_mem(addr, PAGE_SIZE); -		clear_page_poison(page); -	} +	addr = kmap_atomic(page); +	check_poison_mem(addr, PAGE_SIZE); +	clear_page_poison(page); +	kunmap_atomic(addr);  }  static void unpoison_pages(struct page *page, int n) diff --git a/mm/filemap.c b/mm/filemap.c index 7771871fa35..5cf820a7c8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2115,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)  	} else {  		const struct iovec *iov = i->iov;  		size_t base = i->iov_offset; +		unsigned long nr_segs = i->nr_segs;  		/*  		 * The !iov->iov_len check ensures we skip over unlikely @@ -2130,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)  			base += copy;  			if (iov->iov_len == base) {  				iov++; +				nr_segs--;  				base = 0;  			}  		}  		i->iov = iov;  		i->iov_offset = base; +		i->nr_segs = nr_segs;  	}  }  EXPORT_SYMBOL(iov_iter_advance); diff --git a/mm/highmem.c b/mm/highmem.c index 5ef672c07f7..e159a7b1cc2 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)  #endif  /** - * kunmap_high - map a highmem page into memory + * kunmap_high - unmap a highmem page into memory   * @page: &struct page to unmap   *   * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2d1587be26..4298abaae15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,7 +89,8 @@ struct khugepaged_scan {  	struct list_head mm_head;  	struct mm_slot *mm_slot;  	unsigned long address; -} khugepaged_scan = { +}; +static struct khugepaged_scan khugepaged_scan = {  	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),  }; @@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,  	for (i = 0; i < HPAGE_PMD_NR; i++) {  		copy_user_highpage(pages[i], page + i, -				   haddr + PAGE_SHIFT*i, vma); +				   haddr + PAGE_SIZE * i, vma);  		__SetPageUptodate(pages[i]);  		cond_resched();  	} @@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;  	VM_BUG_ON(!PageCompound(page));  	if (flags & FOLL_GET) -		get_page(page); +		get_page_foll(page);  out:  	return page; @@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,  	return ret;  } +int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, +		  unsigned long old_addr, +		  unsigned long new_addr, unsigned long old_end, +		  pmd_t *old_pmd, pmd_t *new_pmd) +{ +	int ret = 0; +	pmd_t pmd; + +	struct mm_struct *mm = vma->vm_mm; + +	if ((old_addr & ~HPAGE_PMD_MASK) || +	    (new_addr & ~HPAGE_PMD_MASK) || +	    old_end - old_addr < HPAGE_PMD_SIZE || +	    (new_vma->vm_flags & VM_NOHUGEPAGE)) +		goto out; + +	/* +	 * The destination pmd shouldn't be established, free_pgtables() +	 * should have release it. 
+	 */ +	if (WARN_ON(!pmd_none(*new_pmd))) { +		VM_BUG_ON(pmd_trans_huge(*new_pmd)); +		goto out; +	} + +	spin_lock(&mm->page_table_lock); +	if (likely(pmd_trans_huge(*old_pmd))) { +		if (pmd_trans_splitting(*old_pmd)) { +			spin_unlock(&mm->page_table_lock); +			wait_split_huge_page(vma->anon_vma, old_pmd); +			ret = -1; +		} else { +			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); +			VM_BUG_ON(!pmd_none(*new_pmd)); +			set_pmd_at(mm, new_addr, new_pmd, pmd); +			spin_unlock(&mm->page_table_lock); +			ret = 1; +		} +	} else { +		spin_unlock(&mm->page_table_lock); +	} +out: +	return ret; +} +  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,  		unsigned long addr, pgprot_t newprot)  { @@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)  	unsigned long head_index = page->index;  	struct zone *zone = page_zone(page);  	int zonestat; +	int tail_count = 0;  	/* prevent PageLRU to go away from under us, and freeze lru stats */  	spin_lock_irq(&zone->lru_lock); @@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)  	for (i = 1; i < HPAGE_PMD_NR; i++) {  		struct page *page_tail = page + i; -		/* tail_page->_count cannot change */ -		atomic_sub(atomic_read(&page_tail->_count), &page->_count); -		BUG_ON(page_count(page) <= 0); -		atomic_add(page_mapcount(page) + 1, &page_tail->_count); -		BUG_ON(atomic_read(&page_tail->_count) <= 0); +		/* tail_page->_mapcount cannot change */ +		BUG_ON(page_mapcount(page_tail) < 0); +		tail_count += page_mapcount(page_tail); +		/* check for overflow */ +		BUG_ON(tail_count < 0); +		BUG_ON(atomic_read(&page_tail->_count) != 0); +		/* +		 * tail_page->_count is zero and not changing from +		 * under us. But get_page_unless_zero() may be running +		 * from under us on the tail_page. If we used +		 * atomic_set() below instead of atomic_add(), we +		 * would then run atomic_set() concurrently with +		 * get_page_unless_zero(), and atomic_set() is +		 * implemented in C not using locked ops. spin_unlock +		 * on x86 sometime uses locked ops because of PPro +		 * errata 66, 92, so unless somebody can guarantee +		 * atomic_set() here would be safe on all archs (and +		 * not only on x86), it's safer to use atomic_add(). +		 */ +		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, +			   &page_tail->_count);  		/* after clearing PageTail the gup refcount can be released */  		smp_mb(); @@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)  				      (1L << PG_uptodate)));  		page_tail->flags |= (1L << PG_dirty); -		/* -		 * 1) clear PageTail before overwriting first_page -		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON -		 */ +		/* clear PageTail before overwriting first_page */  		smp_wmb();  		/* @@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)  		 * status is achieved setting a reserved bit in the  		 * pmd, not by clearing the present bit.  		
*/ -		BUG_ON(page_mapcount(page_tail));  		page_tail->_mapcount = page->_mapcount;  		BUG_ON(page_tail->mapping); @@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)  		lru_add_page_tail(zone, page, page_tail);  	} +	atomic_sub(tail_count, &page->_count); +	BUG_ON(atomic_read(&page->_count) <= 0);  	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);  	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); @@ -1906,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm,  	BUG_ON(!pmd_none(*pmd));  	page_add_new_anon_rmap(new_page, vma, address);  	set_pmd_at(mm, address, pmd, _pmd); -	update_mmu_cache(vma, address, entry); +	update_mmu_cache(vma, address, _pmd);  	prepare_pmd_huge_pte(pgtable, mm);  	mm->nr_ptes--;  	spin_unlock(&mm->page_table_lock); @@ -2024,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,  					    struct page **hpage) +	__releases(&khugepaged_mm_lock) +	__acquires(&khugepaged_mm_lock)  {  	struct mm_slot *mm_slot;  	struct mm_struct *mm; diff --git a/mm/internal.h b/mm/internal.h index d071d380fb4..2189af49178 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)  	atomic_dec(&page->_count);  } +static inline void __get_page_tail_foll(struct page *page, +					bool get_page_head) +{ +	/* +	 * If we're getting a tail page, the elevated page->_count is +	 * required only in the head page and we will elevate the head +	 * page->_count and tail page->_mapcount. +	 * +	 * We elevate page_tail->_mapcount for tail pages to force +	 * page_tail->_count to be zero at all times to avoid getting +	 * false positives from get_page_unless_zero() with +	 * speculative page access (like in +	 * page_cache_get_speculative()) on tail pages. +	 */ +	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); +	VM_BUG_ON(atomic_read(&page->_count) != 0); +	VM_BUG_ON(page_mapcount(page) < 0); +	if (get_page_head) +		atomic_inc(&page->first_page->_count); +	atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ +	if (unlikely(PageTail(page))) +		/* +		 * This is safe only because +		 * __split_huge_page_refcount() can't run under +		 * get_page_foll() because we hold the proper PT lock. +		 */ +		__get_page_tail_foll(page, true); +	else { +		/* +		 * Getting a normal page or the head of a compound page +		 * requires to already have an elevated page->_count. 
+		 */ +		VM_BUG_ON(atomic_read(&page->_count) <= 0); +		atomic_inc(&page->_count); +	} +} +  extern unsigned long highest_memmap_pfn;  /* @@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,  			oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);  			err = unmerge_and_remove_all_rmap_items(); -			test_set_oom_score_adj(oom_score_adj); +			compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, +								oom_score_adj);  			if (err) {  				ksm_run = KSM_RUN_STOP;  				count = err; diff --git a/mm/memblock.c b/mm/memblock.c index ccbf9733959..84bec4969ed 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p  	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));  } -long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock memblock_overlaps_region(struct memblock_type *type, +					phys_addr_t base, phys_addr_t size)  {  	unsigned long i; @@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)  	return 0;  } -extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, +int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,  					  phys_addr_t addr2, phys_addr_t size2)  {  	return 1; @@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void)  	return memblock.memory_size;  } +/* lowest address */ +phys_addr_t __init_memblock memblock_start_of_DRAM(void) +{ +	return memblock.memory.regions[0].base; +} +  phys_addr_t __init_memblock memblock_end_of_DRAM(void)  {  	int idx = memblock.memory.cnt - 1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3508777837c..7af1d5ee159 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -201,8 +201,8 @@ struct mem_cgroup_eventfd_list {  	struct eventfd_ctx *eventfd;  }; -static void mem_cgroup_threshold(struct mem_cgroup *mem); -static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);  /*   * The memory controller data structure. 
The memory controller controls both @@ -362,29 +362,29 @@ enum charge_type {  #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2  #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT) -static void mem_cgroup_get(struct mem_cgroup *mem); -static void mem_cgroup_put(struct mem_cgroup *mem); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(struct mem_cgroup *mem); +static void mem_cgroup_get(struct mem_cgroup *memcg); +static void mem_cgroup_put(struct mem_cgroup *memcg); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); +static void drain_all_stock_async(struct mem_cgroup *memcg);  static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)  { -	return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];  } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)  { -	return &mem->css; +	return &memcg->css;  }  static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) +page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)  {  	int nid = page_to_nid(page);  	int zid = page_zonenum(page); -	return mem_cgroup_zoneinfo(mem, nid, zid); +	return mem_cgroup_zoneinfo(memcg, nid, zid);  }  static struct mem_cgroup_tree_per_zone * @@ -403,7 +403,7 @@ soft_limit_tree_from_page(struct page *page)  }  static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,  				struct mem_cgroup_per_zone *mz,  				struct mem_cgroup_tree_per_zone *mctz,  				unsigned long long new_usage_in_excess) @@ -437,7 +437,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,  }  static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,  				struct mem_cgroup_per_zone *mz,  				struct mem_cgroup_tree_per_zone *mctz)  { @@ -448,17 +448,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,  }  static void -mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,  				struct mem_cgroup_per_zone *mz,  				struct mem_cgroup_tree_per_zone *mctz)  {  	spin_lock(&mctz->lock); -	__mem_cgroup_remove_exceeded(mem, mz, mctz); +	__mem_cgroup_remove_exceeded(memcg, mz, mctz);  	spin_unlock(&mctz->lock);  } -static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  {  	unsigned long long excess;  	struct mem_cgroup_per_zone *mz; @@ -471,9 +471,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)  	 * Necessary to update all ancestors when hierarchy is used.  	 * because their event counter is not touched.  	 */ -	for (; mem; mem = parent_mem_cgroup(mem)) { -		mz = mem_cgroup_zoneinfo(mem, nid, zid); -		excess = res_counter_soft_limit_excess(&mem->res); +	for (; memcg; memcg = parent_mem_cgroup(memcg)) { +		mz = mem_cgroup_zoneinfo(memcg, nid, zid); +		excess = res_counter_soft_limit_excess(&memcg->res);  		/*  		 * We have to update the tree if mz is on RB-tree or  		 * mem is over its softlimit. 
@@ -482,18 +482,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)  			spin_lock(&mctz->lock);  			/* if on-tree, remove it */  			if (mz->on_tree) -				__mem_cgroup_remove_exceeded(mem, mz, mctz); +				__mem_cgroup_remove_exceeded(memcg, mz, mctz);  			/*  			 * Insert again. mz->usage_in_excess will be updated.  			 * If excess is 0, no tree ops.  			 */ -			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess); +			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);  			spin_unlock(&mctz->lock);  		}  	}  } -static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)  {  	int node, zone;  	struct mem_cgroup_per_zone *mz; @@ -501,9 +501,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)  	for_each_node_state(node, N_POSSIBLE) {  		for (zone = 0; zone < MAX_NR_ZONES; zone++) { -			mz = mem_cgroup_zoneinfo(mem, node, zone); +			mz = mem_cgroup_zoneinfo(memcg, node, zone);  			mctz = soft_limit_tree_node_zone(node, zone); -			mem_cgroup_remove_exceeded(mem, mz, mctz); +			mem_cgroup_remove_exceeded(memcg, mz, mctz);  		}  	}  } @@ -564,7 +564,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)   * common workload, threashold and synchonization as vmstat[] should be   * implemented.   */ -static long mem_cgroup_read_stat(struct mem_cgroup *mem, +static long mem_cgroup_read_stat(struct mem_cgroup *memcg,  				 enum mem_cgroup_stat_index idx)  {  	long val = 0; @@ -572,81 +572,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,  	get_online_cpus();  	for_each_online_cpu(cpu) -		val += per_cpu(mem->stat->count[idx], cpu); +		val += per_cpu(memcg->stat->count[idx], cpu);  #ifdef CONFIG_HOTPLUG_CPU -	spin_lock(&mem->pcp_counter_lock); -	val += mem->nocpu_base.count[idx]; -	spin_unlock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock); +	val += memcg->nocpu_base.count[idx]; +	spin_unlock(&memcg->pcp_counter_lock);  #endif  	put_online_cpus();  	return val;  } -static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,  					 bool charge)  {  	int val = (charge) ? 
1 : -1; -	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); +	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);  } -void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)  { -	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);  } -void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)  { -	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);  } -static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, +static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  					    enum mem_cgroup_events_index idx)  {  	unsigned long val = 0;  	int cpu;  	for_each_online_cpu(cpu) -		val += per_cpu(mem->stat->events[idx], cpu); +		val += per_cpu(memcg->stat->events[idx], cpu);  #ifdef CONFIG_HOTPLUG_CPU -	spin_lock(&mem->pcp_counter_lock); -	val += mem->nocpu_base.events[idx]; -	spin_unlock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock); +	val += memcg->nocpu_base.events[idx]; +	spin_unlock(&memcg->pcp_counter_lock);  #endif  	return val;  } -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, +static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,  					 bool file, int nr_pages)  {  	preempt_disable();  	if (file) -		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); +		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], +				nr_pages);  	else -		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); +		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], +				nr_pages);  	/* pagein of a big page is an event. 
So, ignore page size */  	if (nr_pages > 0) -		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); +		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);  	else { -		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); +		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);  		nr_pages = -nr_pages; /* for event */  	} -	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); +	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);  	preempt_enable();  }  unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,  			unsigned int lru_mask)  {  	struct mem_cgroup_per_zone *mz;  	enum lru_list l;  	unsigned long ret = 0; -	mz = mem_cgroup_zoneinfo(mem, nid, zid); +	mz = mem_cgroup_zoneinfo(memcg, nid, zid);  	for_each_lru(l) {  		if (BIT(l) & lru_mask) @@ -656,44 +658,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,  }  static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,  			int nid, unsigned int lru_mask)  {  	u64 total = 0;  	int zid;  	for (zid = 0; zid < MAX_NR_ZONES; zid++) -		total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); +		total += mem_cgroup_zone_nr_lru_pages(memcg, +						nid, zid, lru_mask);  	return total;  } -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,  			unsigned int lru_mask)  {  	int nid;  	u64 total = 0;  	for_each_node_state(nid, N_HIGH_MEMORY) -		total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); +		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);  	return total;  } -static bool __memcg_event_check(struct mem_cgroup *mem, int target) +static bool __memcg_event_check(struct mem_cgroup *memcg, int target)  {  	unsigned long val, next; -	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); -	next = this_cpu_read(mem->stat->targets[target]); +	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); +	next = __this_cpu_read(memcg->stat->targets[target]);  	/* from time_after() in jiffies.h */  	return ((long)next - (long)val < 0);  } -static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)  {  	unsigned long val, next; -	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); +	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);  	switch (target) {  	case MEM_CGROUP_TARGET_THRESH: @@ -709,34 +712,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)  		return;  	} -	this_cpu_write(mem->stat->targets[target], next); +	__this_cpu_write(memcg->stat->targets[target], next);  }  /*   * Check events in order.   
*   */ -static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)  { +	preempt_disable();  	/* threshold event is triggered in finer grain than soft limit */ -	if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { -		mem_cgroup_threshold(mem); -		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); -		if (unlikely(__memcg_event_check(mem, +	if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { +		mem_cgroup_threshold(memcg); +		__mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); +		if (unlikely(__memcg_event_check(memcg,  			     MEM_CGROUP_TARGET_SOFTLIMIT))) { -			mem_cgroup_update_tree(mem, page); -			__mem_cgroup_target_update(mem, +			mem_cgroup_update_tree(memcg, page); +			__mem_cgroup_target_update(memcg,  						   MEM_CGROUP_TARGET_SOFTLIMIT);  		}  #if MAX_NUMNODES > 1 -		if (unlikely(__memcg_event_check(mem, +		if (unlikely(__memcg_event_check(memcg,  			MEM_CGROUP_TARGET_NUMAINFO))) { -			atomic_inc(&mem->numainfo_events); -			__mem_cgroup_target_update(mem, +			atomic_inc(&memcg->numainfo_events); +			__mem_cgroup_target_update(memcg,  				MEM_CGROUP_TARGET_NUMAINFO);  		}  #endif  	} +	preempt_enable();  }  static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -762,7 +767,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)  struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	if (!mm)  		return NULL; @@ -773,25 +778,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)  	 */  	rcu_read_lock();  	do { -		mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); -		if (unlikely(!mem)) +		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); +		if (unlikely(!memcg))  			break; -	} while (!css_tryget(&mem->css)); +	} while (!css_tryget(&memcg->css));  	rcu_read_unlock(); -	return mem; +	return memcg;  }  /* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)  {  	struct cgroup_subsys_state *css;  	int found; -	if (!mem) /* ROOT cgroup has the smallest ID */ +	if (!memcg) /* ROOT cgroup has the smallest ID */  		return root_mem_cgroup; /*css_put/get against root is ignored*/ -	if (!mem->use_hierarchy) { -		if (css_tryget(&mem->css)) -			return mem; +	if (!memcg->use_hierarchy) { +		if (css_tryget(&memcg->css)) +			return memcg;  		return NULL;  	}  	rcu_read_lock(); @@ -799,13 +804,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)  	 * searching a memory cgroup which has the smallest ID under given  	 * ROOT cgroup. 
(ID >= 1)  	 */ -	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); +	css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);  	if (css && css_tryget(css)) -		mem = container_of(css, struct mem_cgroup, css); +		memcg = container_of(css, struct mem_cgroup, css);  	else -		mem = NULL; +		memcg = NULL;  	rcu_read_unlock(); -	return mem; +	return memcg;  }  static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, @@ -859,29 +864,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,  	for_each_mem_cgroup_tree_cond(iter, NULL, true) -static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)  { -	return (mem == root_mem_cgroup); +	return (memcg == root_mem_cgroup);  }  void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)  { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	if (!mm)  		return;  	rcu_read_lock(); -	mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); -	if (unlikely(!mem)) +	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); +	if (unlikely(!memcg))  		goto out;  	switch (idx) {  	case PGMAJFAULT: -		mem_cgroup_pgmajfault(mem, 1); +		mem_cgroup_pgmajfault(memcg, 1);  		break;  	case PGFAULT: -		mem_cgroup_pgfault(mem, 1); +		mem_cgroup_pgfault(memcg, 1);  		break;  	default:  		BUG(); @@ -990,6 +995,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)  		return;  	pc = lookup_page_cgroup(page);  	VM_BUG_ON(PageCgroupAcctLRU(pc)); +	/* +	 * putback:				charge: +	 * SetPageLRU				SetPageCgroupUsed +	 * smp_mb				smp_mb +	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU +	 * +	 * Ensure that one of the two sides adds the page to the memcg +	 * LRU during a race. +	 */ +	smp_mb();  	if (!PageCgroupUsed(pc))  		return;  	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ @@ -1041,7 +1056,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)  	unsigned long flags;  	struct zone *zone = page_zone(page);  	struct page_cgroup *pc = lookup_page_cgroup(page); - +	/* +	 * putback:				charge: +	 * SetPageLRU				SetPageCgroupUsed +	 * smp_mb				smp_mb +	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU +	 * +	 * Ensure that one of the two sides adds the page to the memcg +	 * LRU during a race. 
+	 */ +	smp_mb();  	/* taking care of that the page is added to LRU while we commit it */  	if (likely(!PageLRU(page)))  		return; @@ -1063,21 +1087,21 @@ void mem_cgroup_move_lists(struct page *page,  }  /* - * Checks whether given mem is same or in the root_mem's + * Checks whether given mem is same or in the root_mem_cgroup's   * hierarchy subtree   */ -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, -		struct mem_cgroup *mem) +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, +		struct mem_cgroup *memcg)  { -	if (root_mem != mem) { -		return (root_mem->use_hierarchy && -			css_is_ancestor(&mem->css, &root_mem->css)); +	if (root_memcg != memcg) { +		return (root_memcg->use_hierarchy && +			css_is_ancestor(&memcg->css, &root_memcg->css));  	}  	return true;  } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)  {  	int ret;  	struct mem_cgroup *curr = NULL; @@ -1091,25 +1115,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)  	if (!curr)  		return 0;  	/* -	 * We should check use_hierarchy of "mem" not "curr". Because checking +	 * We should check use_hierarchy of "memcg" not "curr". Because checking  	 * use_hierarchy of "curr" here make this function true if hierarchy is -	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* -	 * hierarchy(even if use_hierarchy is disabled in "mem"). +	 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* +	 * hierarchy(even if use_hierarchy is disabled in "memcg").  	 */ -	ret = mem_cgroup_same_or_subtree(mem, curr); +	ret = mem_cgroup_same_or_subtree(memcg, curr);  	css_put(&curr->css);  	return ret;  } -static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)  { -	unsigned long active; +	unsigned long inactive_ratio; +	int nid = zone_to_nid(zone); +	int zid = zone_idx(zone);  	unsigned long inactive; +	unsigned long active;  	unsigned long gb; -	unsigned long inactive_ratio; -	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); -	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); +	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, +						BIT(LRU_INACTIVE_ANON)); +	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, +					      BIT(LRU_ACTIVE_ANON));  	gb = (inactive + active) >> (30 - PAGE_SHIFT);  	if (gb) @@ -1117,39 +1145,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_  	else  		inactive_ratio = 1; -	if (present_pages) { -		present_pages[0] = inactive; -		present_pages[1] = active; -	} - -	return inactive_ratio; +	return inactive * inactive_ratio < active;  } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) -{ -	unsigned long active; -	unsigned long inactive; -	unsigned long present_pages[2]; -	unsigned long inactive_ratio; - -	inactive_ratio = calc_inactive_ratio(memcg, present_pages); - -	inactive = present_pages[0]; -	active = present_pages[1]; - -	if (inactive * inactive_ratio < active) -		return 1; - -	return 0; -} - -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)  {  	unsigned long active;  	unsigned long inactive; +	int zid = zone_idx(zone); +	int nid = zone_to_nid(zone); -	inactive = mem_cgroup_nr_lru_pages(memcg, 
BIT(LRU_INACTIVE_FILE)); -	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); +	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, +						BIT(LRU_INACTIVE_FILE)); +	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, +					      BIT(LRU_ACTIVE_FILE));  	return (active > inactive);  } @@ -1185,7 +1194,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,  					struct list_head *dst,  					unsigned long *scanned, int order, -					int mode, struct zone *z, +					isolate_mode_t mode, +					struct zone *z,  					struct mem_cgroup *mem_cont,  					int active, int file)  { @@ -1253,13 +1263,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,   * Returns the maximum amount of memory @mem can be charged with, in   * pages.   */ -static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) +static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)  {  	unsigned long long margin; -	margin = res_counter_margin(&mem->res); +	margin = res_counter_margin(&memcg->res);  	if (do_swap_account) -		margin = min(margin, res_counter_margin(&mem->memsw)); +		margin = min(margin, res_counter_margin(&memcg->memsw));  	return margin >> PAGE_SHIFT;  } @@ -1274,33 +1284,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)  	return memcg->swappiness;  } -static void mem_cgroup_start_move(struct mem_cgroup *mem) +static void mem_cgroup_start_move(struct mem_cgroup *memcg)  {  	int cpu;  	get_online_cpus(); -	spin_lock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock);  	for_each_online_cpu(cpu) -		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; -	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; -	spin_unlock(&mem->pcp_counter_lock); +		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; +	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; +	spin_unlock(&memcg->pcp_counter_lock);  	put_online_cpus();  	synchronize_rcu();  } -static void mem_cgroup_end_move(struct mem_cgroup *mem) +static void mem_cgroup_end_move(struct mem_cgroup *memcg)  {  	int cpu; -	if (!mem) +	if (!memcg)  		return;  	get_online_cpus(); -	spin_lock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock);  	for_each_online_cpu(cpu) -		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; -	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; -	spin_unlock(&mem->pcp_counter_lock); +		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; +	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; +	spin_unlock(&memcg->pcp_counter_lock);  	put_online_cpus();  }  /* @@ -1315,13 +1325,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)   *			  waiting at hith-memory prressure caused by "move".   
*/ -static bool mem_cgroup_stealed(struct mem_cgroup *mem) +static bool mem_cgroup_stealed(struct mem_cgroup *memcg)  {  	VM_BUG_ON(!rcu_read_lock_held()); -	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;  } -static bool mem_cgroup_under_move(struct mem_cgroup *mem) +static bool mem_cgroup_under_move(struct mem_cgroup *memcg)  {  	struct mem_cgroup *from;  	struct mem_cgroup *to; @@ -1336,17 +1346,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)  	if (!from)  		goto unlock; -	ret = mem_cgroup_same_or_subtree(mem, from) -		|| mem_cgroup_same_or_subtree(mem, to); +	ret = mem_cgroup_same_or_subtree(memcg, from) +		|| mem_cgroup_same_or_subtree(memcg, to);  unlock:  	spin_unlock(&mc.lock);  	return ret;  } -static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)  {  	if (mc.moving_task && current != mc.moving_task) { -		if (mem_cgroup_under_move(mem)) { +		if (mem_cgroup_under_move(memcg)) {  			DEFINE_WAIT(wait);  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);  			/* moving charge context might have finished. */ @@ -1430,12 +1440,12 @@ done:   * This function returns the number of memcg under hierarchy tree. Returns   * 1(self count) if no children.   */ -static int mem_cgroup_count_children(struct mem_cgroup *mem) +static int mem_cgroup_count_children(struct mem_cgroup *memcg)  {  	int num = 0;  	struct mem_cgroup *iter; -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		num++;  	return num;  } @@ -1465,21 +1475,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)   * that to reclaim free pages from.   */  static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_mem) +mem_cgroup_select_victim(struct mem_cgroup *root_memcg)  {  	struct mem_cgroup *ret = NULL;  	struct cgroup_subsys_state *css;  	int nextid, found; -	if (!root_mem->use_hierarchy) { -		css_get(&root_mem->css); -		ret = root_mem; +	if (!root_memcg->use_hierarchy) { +		css_get(&root_memcg->css); +		ret = root_memcg;  	}  	while (!ret) {  		rcu_read_lock(); -		nextid = root_mem->last_scanned_child + 1; -		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, +		nextid = root_memcg->last_scanned_child + 1; +		css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,  				   &found);  		if (css && css_tryget(css))  			ret = container_of(css, struct mem_cgroup, css); @@ -1488,9 +1498,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)  		/* Updates scanning parameter */  		if (!css) {  			/* this means start scan from ID:1 */ -			root_mem->last_scanned_child = 0; +			root_memcg->last_scanned_child = 0;  		} else -			root_mem->last_scanned_child = found; +			root_memcg->last_scanned_child = found;  	}  	return ret; @@ -1506,14 +1516,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)   * reclaimable pages on a node. Returns true if there are any reclaimable   * pages in the node.   
*/ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,  		int nid, bool noswap)  { -	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) +	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))  		return true;  	if (noswap || !total_swap_pages)  		return false; -	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) +	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))  		return true;  	return false; @@ -1526,29 +1536,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,   * nodes based on the zonelist. So update the list loosely once per 10 secs.   *   */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)  {  	int nid;  	/*  	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET  	 * pagein/pageout changes since the last update.  	 */ -	if (!atomic_read(&mem->numainfo_events)) +	if (!atomic_read(&memcg->numainfo_events))  		return; -	if (atomic_inc_return(&mem->numainfo_updating) > 1) +	if (atomic_inc_return(&memcg->numainfo_updating) > 1)  		return;  	/* make a nodemask where this memcg uses memory from */ -	mem->scan_nodes = node_states[N_HIGH_MEMORY]; +	memcg->scan_nodes = node_states[N_HIGH_MEMORY];  	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { -		if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) -			node_clear(nid, mem->scan_nodes); +		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) +			node_clear(nid, memcg->scan_nodes);  	} -	atomic_set(&mem->numainfo_events, 0); -	atomic_set(&mem->numainfo_updating, 0); +	atomic_set(&memcg->numainfo_events, 0); +	atomic_set(&memcg->numainfo_updating, 0);  }  /* @@ -1563,16 +1573,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)   *   * Now, we use round-robin. Better algorithm is welcomed.   */ -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)  {  	int node; -	mem_cgroup_may_update_nodemask(mem); -	node = mem->last_scanned_node; +	mem_cgroup_may_update_nodemask(memcg); +	node = memcg->last_scanned_node; -	node = next_node(node, mem->scan_nodes); +	node = next_node(node, memcg->scan_nodes);  	if (node == MAX_NUMNODES) -		node = first_node(mem->scan_nodes); +		node = first_node(memcg->scan_nodes);  	/*  	 * We call this when we hit limit, not when pages are added to LRU.  	 * No LRU may hold pages because all pages are UNEVICTABLE or @@ -1582,7 +1592,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)  	if (unlikely(node == MAX_NUMNODES))  		node = numa_node_id(); -	mem->last_scanned_node = node; +	memcg->last_scanned_node = node;  	return node;  } @@ -1592,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)   * unused nodes. But scan_nodes is lazily updated and may not cotain   * enough new information. We need to do double check.   */ -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)  {  	int nid; @@ -1600,12 +1610,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)  	 * quick check...making use of scan_node.  	 * We can skip unused nodes.  	 
*/ -	if (!nodes_empty(mem->scan_nodes)) { -		for (nid = first_node(mem->scan_nodes); +	if (!nodes_empty(memcg->scan_nodes)) { +		for (nid = first_node(memcg->scan_nodes);  		     nid < MAX_NUMNODES; -		     nid = next_node(nid, mem->scan_nodes)) { +		     nid = next_node(nid, memcg->scan_nodes)) { -			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) +			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))  				return true;  		}  	} @@ -1613,23 +1623,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)  	 * Check rest of nodes.  	 */  	for_each_node_state(nid, N_HIGH_MEMORY) { -		if (node_isset(nid, mem->scan_nodes)) +		if (node_isset(nid, memcg->scan_nodes))  			continue; -		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) +		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))  			return true;  	}  	return false;  }  #else -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)  {  	return 0;  } -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)  { -	return test_mem_cgroup_node_reclaimable(mem, 0, noswap); +	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);  }  #endif @@ -1638,14 +1648,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)   * we reclaimed from, so that we don't end up penalizing one child extensively   * based on its position in the children list.   * - * root_mem is the original ancestor that we've been reclaim from. + * root_memcg is the original ancestor that we've been reclaim from.   * - * We give up and return to the caller when we visit root_mem twice. + * We give up and return to the caller when we visit root_memcg twice.   * (other groups can be removed while we're walking....)   *   * If shrink==true, for avoiding to free too much, this returns immedieately.   */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, +static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,  						struct zone *zone,  						gfp_t gfp_mask,  						unsigned long reclaim_options, @@ -1660,15 +1670,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,  	unsigned long excess;  	unsigned long nr_scanned; -	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; +	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;  	/* If memsw_is_minimum==1, swap-out is of-no-use. */ -	if (!check_soft && !shrink && root_mem->memsw_is_minimum) +	if (!check_soft && !shrink && root_memcg->memsw_is_minimum)  		noswap = true;  	while (1) { -		victim = mem_cgroup_select_victim(root_mem); -		if (victim == root_mem) { +		victim = mem_cgroup_select_victim(root_memcg); +		if (victim == root_memcg) {  			loop++;  			/*  			 * We are not draining per cpu cached charges during @@ -1677,7 +1687,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,  			 * charges will not give any.  			 
*/  			if (!check_soft && loop >= 1) -				drain_all_stock_async(root_mem); +				drain_all_stock_async(root_memcg);  			if (loop >= 2) {  				/*  				 * If we have not been able to reclaim @@ -1724,9 +1734,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,  			return ret;  		total += ret;  		if (check_soft) { -			if (!res_counter_soft_limit_excess(&root_mem->res)) +			if (!res_counter_soft_limit_excess(&root_memcg->res))  				return total; -		} else if (mem_cgroup_margin(root_mem)) +		} else if (mem_cgroup_margin(root_memcg))  			return total;  	}  	return total; @@ -1737,12 +1747,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,   * If someone is running, return false.   * Has to be called with memcg_oom_lock   */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) +static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)  {  	struct mem_cgroup *iter, *failed = NULL;  	bool cond = true; -	for_each_mem_cgroup_tree_cond(iter, mem, cond) { +	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {  		if (iter->oom_lock) {  			/*  			 * this subtree of our hierarchy is already locked @@ -1762,7 +1772,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)  	 * what we set up to the failing subtree  	 */  	cond = true; -	for_each_mem_cgroup_tree_cond(iter, mem, cond) { +	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {  		if (iter == failed) {  			cond = false;  			continue; @@ -1775,24 +1785,24 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)  /*   * Has to be called with memcg_oom_lock   */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) +static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)  {  	struct mem_cgroup *iter; -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		iter->oom_lock = false;  	return 0;  } -static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)  {  	struct mem_cgroup *iter; -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		atomic_inc(&iter->under_oom);  } -static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)  {  	struct mem_cgroup *iter; @@ -1801,7 +1811,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)  	 * mem_cgroup_oom_lock() may not be called. We have to use  	 * atomic_add_unless() here.  	 */ -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		atomic_add_unless(&iter->under_oom, -1, 0);  } @@ -1816,85 +1826,85 @@ struct oom_wait_info {  static int memcg_oom_wake_function(wait_queue_t *wait,  	unsigned mode, int sync, void *arg)  { -	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, -			  *oom_wait_mem; +	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, +			  *oom_wait_memcg;  	struct oom_wait_info *oom_wait_info;  	oom_wait_info = container_of(wait, struct oom_wait_info, wait); -	oom_wait_mem = oom_wait_info->mem; +	oom_wait_memcg = oom_wait_info->mem;  	/*  	 * Both of oom_wait_info->mem and wake_mem are stable under us.  	 * Then we can use css_is_ancestor without taking care of RCU.  	 
*/ -	if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) -			&& !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) +	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) +		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))  		return 0;  	return autoremove_wake_function(wait, mode, sync, arg);  } -static void memcg_wakeup_oom(struct mem_cgroup *mem) +static void memcg_wakeup_oom(struct mem_cgroup *memcg)  { -	/* for filtering, pass "mem" as argument. */ -	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); +	/* for filtering, pass "memcg" as argument. */ +	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);  } -static void memcg_oom_recover(struct mem_cgroup *mem) +static void memcg_oom_recover(struct mem_cgroup *memcg)  { -	if (mem && atomic_read(&mem->under_oom)) -		memcg_wakeup_oom(mem); +	if (memcg && atomic_read(&memcg->under_oom)) +		memcg_wakeup_oom(memcg);  }  /*   * try to call OOM killer. returns false if we should exit memory-reclaim loop.   */ -bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)  {  	struct oom_wait_info owait;  	bool locked, need_to_kill; -	owait.mem = mem; +	owait.mem = memcg;  	owait.wait.flags = 0;  	owait.wait.func = memcg_oom_wake_function;  	owait.wait.private = current;  	INIT_LIST_HEAD(&owait.wait.task_list);  	need_to_kill = true; -	mem_cgroup_mark_under_oom(mem); +	mem_cgroup_mark_under_oom(memcg); -	/* At first, try to OOM lock hierarchy under mem.*/ +	/* At first, try to OOM lock hierarchy under memcg.*/  	spin_lock(&memcg_oom_lock); -	locked = mem_cgroup_oom_lock(mem); +	locked = mem_cgroup_oom_lock(memcg);  	/*  	 * Even if signal_pending(), we can't quit charge() loop without  	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL  	 * under OOM is always welcomed, use TASK_KILLABLE here.  	 */  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); -	if (!locked || mem->oom_kill_disable) +	if (!locked || memcg->oom_kill_disable)  		need_to_kill = false;  	if (locked) -		mem_cgroup_oom_notify(mem); +		mem_cgroup_oom_notify(memcg);  	spin_unlock(&memcg_oom_lock);  	if (need_to_kill) {  		finish_wait(&memcg_oom_waitq, &owait.wait); -		mem_cgroup_out_of_memory(mem, mask); +		mem_cgroup_out_of_memory(memcg, mask);  	} else {  		schedule();  		finish_wait(&memcg_oom_waitq, &owait.wait);  	}  	spin_lock(&memcg_oom_lock);  	if (locked) -		mem_cgroup_oom_unlock(mem); -	memcg_wakeup_oom(mem); +		mem_cgroup_oom_unlock(memcg); +	memcg_wakeup_oom(memcg);  	spin_unlock(&memcg_oom_lock); -	mem_cgroup_unmark_under_oom(mem); +	mem_cgroup_unmark_under_oom(memcg);  	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))  		return false;  	/* Give chance to dying process */ -	schedule_timeout(1); +	schedule_timeout_uninterruptible(1);  	return true;  } @@ -1925,7 +1935,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)  void mem_cgroup_update_page_stat(struct page *page,  				 enum mem_cgroup_page_stat_item idx, int val)  { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	struct page_cgroup *pc = lookup_page_cgroup(page);  	bool need_unlock = false;  	unsigned long uninitialized_var(flags); @@ -1934,16 +1944,16 @@ void mem_cgroup_update_page_stat(struct page *page,  		return;  	rcu_read_lock(); -	mem = pc->mem_cgroup; -	if (unlikely(!mem || !PageCgroupUsed(pc))) +	memcg = pc->mem_cgroup; +	if (unlikely(!memcg || !PageCgroupUsed(pc)))  		goto out;  	/* pc->mem_cgroup is unstable ? 
*/ -	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { +	if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {  		/* take a lock against to access pc->mem_cgroup */  		move_lock_page_cgroup(pc, &flags);  		need_unlock = true; -		mem = pc->mem_cgroup; -		if (!mem || !PageCgroupUsed(pc)) +		memcg = pc->mem_cgroup; +		if (!memcg || !PageCgroupUsed(pc))  			goto out;  	} @@ -1959,7 +1969,7 @@ void mem_cgroup_update_page_stat(struct page *page,  		BUG();  	} -	this_cpu_add(mem->stat->count[idx], val); +	this_cpu_add(memcg->stat->count[idx], val);  out:  	if (unlikely(need_unlock)) @@ -1990,13 +2000,13 @@ static DEFINE_MUTEX(percpu_charge_mutex);   * cgroup which is not current target, returns false. This stock will be   * refilled.   */ -static bool consume_stock(struct mem_cgroup *mem) +static bool consume_stock(struct mem_cgroup *memcg)  {  	struct memcg_stock_pcp *stock;  	bool ret = true;  	stock = &get_cpu_var(memcg_stock); -	if (mem == stock->cached && stock->nr_pages) +	if (memcg == stock->cached && stock->nr_pages)  		stock->nr_pages--;  	else /* need to call res_counter_charge */  		ret = false; @@ -2037,24 +2047,24 @@ static void drain_local_stock(struct work_struct *dummy)   * Cache charges(val) which is from res_counter, to local per_cpu area.   * This will be consumed by consume_stock() function, later.   */ -static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)  {  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); -	if (stock->cached != mem) { /* reset if necessary */ +	if (stock->cached != memcg) { /* reset if necessary */  		drain_stock(stock); -		stock->cached = mem; +		stock->cached = memcg;  	}  	stock->nr_pages += nr_pages;  	put_cpu_var(memcg_stock);  }  /* - * Drains all per-CPU charge caches for given root_mem resp. subtree + * Drains all per-CPU charge caches for given root_memcg resp. subtree   * of the hierarchy under it. sync flag says whether we should block   * until the work is done.   */ -static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)  {  	int cpu, curcpu; @@ -2063,12 +2073,12 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)  	curcpu = get_cpu();  	for_each_online_cpu(cpu) {  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); -		struct mem_cgroup *mem; +		struct mem_cgroup *memcg; -		mem = stock->cached; -		if (!mem || !stock->nr_pages) +		memcg = stock->cached; +		if (!memcg || !stock->nr_pages)  			continue; -		if (!mem_cgroup_same_or_subtree(root_mem, mem)) +		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))  			continue;  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {  			if (cpu == curcpu) @@ -2097,23 +2107,23 @@ out:   * expects some charges will be back to res_counter later but cannot wait for   * it.   */ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_all_stock_async(struct mem_cgroup *root_memcg)  {  	/*  	 * If someone calls draining, avoid adding more kworker runs.  	 */  	if (!mutex_trylock(&percpu_charge_mutex))  		return; -	drain_all_stock(root_mem, false); +	drain_all_stock(root_memcg, false);  	mutex_unlock(&percpu_charge_mutex);  }  /* This is a synchronous drain interface. 
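consume_stock()/refill_stock()/drain_stock() above batch res_counter traffic through a per-CPU cache so that single-page charges normally avoid the shared counter entirely. A rough user-space analogue with a thread-local stock drawn from one global atomic budget (the batch size, the names and the single counter are assumptions of this sketch):

#include <stdatomic.h>
#include <stdbool.h>

#define CHARGE_BATCH 32UL

static atomic_ulong budget = 1024;              /* pages left, stands in for the res_counter */
static _Thread_local unsigned long stock;       /* per-thread pre-charged pages */

static bool charge_global(unsigned long nr)
{
        unsigned long old = atomic_load(&budget);

        do {
                if (old < nr)
                        return false;           /* over limit: caller must reclaim or fail */
        } while (!atomic_compare_exchange_weak(&budget, &old, old - nr));
        return true;
}

/* fast path: serve a one-page charge from the local stock when possible */
static bool charge_one_page(void)
{
        if (stock) {
                stock--;
                return true;
        }
        if (!charge_global(CHARGE_BATCH))
                return charge_global(1);        /* fall back to an exact charge */
        stock = CHARGE_BATCH - 1;               /* keep the rest cached, like refill_stock() */
        return true;
}

/* drain_stock(): give the unused cache back to the global counter */
static void drain_local_stock(void)
{
        if (stock)
                atomic_fetch_add(&budget, stock);
        stock = 0;
}

drain_all_stock() in the kernel additionally has to reach the caches of other CPUs via scheduled work, which the sketch does not attempt.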
*/ -static void drain_all_stock_sync(struct mem_cgroup *root_mem) +static void drain_all_stock_sync(struct mem_cgroup *root_memcg)  {  	/* called when force_empty is called */  	mutex_lock(&percpu_charge_mutex); -	drain_all_stock(root_mem, true); +	drain_all_stock(root_memcg, true);  	mutex_unlock(&percpu_charge_mutex);  } @@ -2121,35 +2131,35 @@ static void drain_all_stock_sync(struct mem_cgroup *root_mem)   * This function drains percpu counter value from DEAD cpu and   * move it to local cpu. Note that this function can be preempted.   */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)  {  	int i; -	spin_lock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock);  	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { -		long x = per_cpu(mem->stat->count[i], cpu); +		long x = per_cpu(memcg->stat->count[i], cpu); -		per_cpu(mem->stat->count[i], cpu) = 0; -		mem->nocpu_base.count[i] += x; +		per_cpu(memcg->stat->count[i], cpu) = 0; +		memcg->nocpu_base.count[i] += x;  	}  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { -		unsigned long x = per_cpu(mem->stat->events[i], cpu); +		unsigned long x = per_cpu(memcg->stat->events[i], cpu); -		per_cpu(mem->stat->events[i], cpu) = 0; -		mem->nocpu_base.events[i] += x; +		per_cpu(memcg->stat->events[i], cpu) = 0; +		memcg->nocpu_base.events[i] += x;  	}  	/* need to clear ON_MOVE value, works as a kind of lock. */ -	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; -	spin_unlock(&mem->pcp_counter_lock); +	per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; +	spin_unlock(&memcg->pcp_counter_lock);  } -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)  {  	int idx = MEM_CGROUP_ON_MOVE; -	spin_lock(&mem->pcp_counter_lock); -	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; -	spin_unlock(&mem->pcp_counter_lock); +	spin_lock(&memcg->pcp_counter_lock); +	per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; +	spin_unlock(&memcg->pcp_counter_lock);  }  static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, @@ -2187,7 +2197,7 @@ enum {  	CHARGE_OOM_DIE,		/* the current is killed because of OOM */  }; -static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, +static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  				unsigned int nr_pages, bool oom_check)  {  	unsigned long csize = nr_pages * PAGE_SIZE; @@ -2196,16 +2206,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,  	unsigned long flags = 0;  	int ret; -	ret = res_counter_charge(&mem->res, csize, &fail_res); +	ret = res_counter_charge(&memcg->res, csize, &fail_res);  	if (likely(!ret)) {  		if (!do_swap_account)  			return CHARGE_OK; -		ret = res_counter_charge(&mem->memsw, csize, &fail_res); +		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);  		if (likely(!ret))  			return CHARGE_OK; -		res_counter_uncharge(&mem->res, csize); +		res_counter_uncharge(&memcg->res, csize);  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);  		flags |= MEM_CGROUP_RECLAIM_NOSWAP;  	} else @@ -2263,12 +2273,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,  static int __mem_cgroup_try_charge(struct mm_struct *mm,  				   gfp_t gfp_mask,  				   unsigned int nr_pages, -				   struct mem_cgroup **memcg, +				   struct mem_cgroup **ptr,  				
   bool oom)  {  	unsigned int batch = max(CHARGE_BATCH, nr_pages);  	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	int ret;  	/* @@ -2286,17 +2296,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,  	 * thread group leader migrates. It's possible that mm is not  	 * set, if so charge the init_mm (happens for pagecache usage).  	 */ -	if (!*memcg && !mm) +	if (!*ptr && !mm)  		goto bypass;  again: -	if (*memcg) { /* css should be a valid one */ -		mem = *memcg; -		VM_BUG_ON(css_is_removed(&mem->css)); -		if (mem_cgroup_is_root(mem)) +	if (*ptr) { /* css should be a valid one */ +		memcg = *ptr; +		VM_BUG_ON(css_is_removed(&memcg->css)); +		if (mem_cgroup_is_root(memcg))  			goto done; -		if (nr_pages == 1 && consume_stock(mem)) +		if (nr_pages == 1 && consume_stock(memcg))  			goto done; -		css_get(&mem->css); +		css_get(&memcg->css);  	} else {  		struct task_struct *p; @@ -2304,7 +2314,7 @@ again:  		p = rcu_dereference(mm->owner);  		/*  		 * Because we don't have task_lock(), "p" can exit. -		 * In that case, "mem" can point to root or p can be NULL with +		 * In that case, "memcg" can point to root or p can be NULL with  		 * race with swapoff. Then, we have small risk of mis-accouning.  		 * But such kind of mis-account by race always happens because  		 * we don't have cgroup_mutex(). It's overkill and we allo that @@ -2312,12 +2322,12 @@ again:  		 * (*) swapoff at el will charge against mm-struct not against  		 * task-struct. So, mm->owner can be NULL.  		 */ -		mem = mem_cgroup_from_task(p); -		if (!mem || mem_cgroup_is_root(mem)) { +		memcg = mem_cgroup_from_task(p); +		if (!memcg || mem_cgroup_is_root(memcg)) {  			rcu_read_unlock();  			goto done;  		} -		if (nr_pages == 1 && consume_stock(mem)) { +		if (nr_pages == 1 && consume_stock(memcg)) {  			/*  			 * It seems dagerous to access memcg without css_get().  			 * But considering how consume_stok works, it's not @@ -2330,7 +2340,7 @@ again:  			goto done;  		}  		/* after here, we may be blocked. 
we need to get refcnt */ -		if (!css_tryget(&mem->css)) { +		if (!css_tryget(&memcg->css)) {  			rcu_read_unlock();  			goto again;  		} @@ -2342,7 +2352,7 @@ again:  		/* If killed, bypass charge */  		if (fatal_signal_pending(current)) { -			css_put(&mem->css); +			css_put(&memcg->css);  			goto bypass;  		} @@ -2352,43 +2362,43 @@ again:  			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;  		} -		ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); +		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);  		switch (ret) {  		case CHARGE_OK:  			break;  		case CHARGE_RETRY: /* not in OOM situation but retry */  			batch = nr_pages; -			css_put(&mem->css); -			mem = NULL; +			css_put(&memcg->css); +			memcg = NULL;  			goto again;  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ -			css_put(&mem->css); +			css_put(&memcg->css);  			goto nomem;  		case CHARGE_NOMEM: /* OOM routine works */  			if (!oom) { -				css_put(&mem->css); +				css_put(&memcg->css);  				goto nomem;  			}  			/* If oom, we never return -ENOMEM */  			nr_oom_retries--;  			break;  		case CHARGE_OOM_DIE: /* Killed by OOM Killer */ -			css_put(&mem->css); +			css_put(&memcg->css);  			goto bypass;  		}  	} while (ret != CHARGE_OK);  	if (batch > nr_pages) -		refill_stock(mem, batch - nr_pages); -	css_put(&mem->css); +		refill_stock(memcg, batch - nr_pages); +	css_put(&memcg->css);  done: -	*memcg = mem; +	*ptr = memcg;  	return 0;  nomem: -	*memcg = NULL; +	*ptr = NULL;  	return -ENOMEM;  bypass: -	*memcg = NULL; +	*ptr = NULL;  	return 0;  } @@ -2397,15 +2407,15 @@ bypass:   * This function is for that and do uncharge, put css's refcnt.   * gotten by try_charge().   */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, +static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,  				       unsigned int nr_pages)  { -	if (!mem_cgroup_is_root(mem)) { +	if (!mem_cgroup_is_root(memcg)) {  		unsigned long bytes = nr_pages * PAGE_SIZE; -		res_counter_uncharge(&mem->res, bytes); +		res_counter_uncharge(&memcg->res, bytes);  		if (do_swap_account) -			res_counter_uncharge(&mem->memsw, bytes); +			res_counter_uncharge(&memcg->memsw, bytes);  	}  } @@ -2430,7 +2440,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	struct page_cgroup *pc;  	unsigned short id;  	swp_entry_t ent; @@ -2440,23 +2450,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) { -		mem = pc->mem_cgroup; -		if (mem && !css_tryget(&mem->css)) -			mem = NULL; +		memcg = pc->mem_cgroup; +		if (memcg && !css_tryget(&memcg->css)) +			memcg = NULL;  	} else if (PageSwapCache(page)) {  		ent.val = page_private(page);  		id = lookup_swap_cgroup(ent);  		rcu_read_lock(); -		mem = mem_cgroup_lookup(id); -		if (mem && !css_tryget(&mem->css)) -			mem = NULL; +		memcg = mem_cgroup_lookup(id); +		if (memcg && !css_tryget(&memcg->css)) +			memcg = NULL;  		rcu_read_unlock();  	}  	unlock_page_cgroup(pc); -	return mem; +	return memcg;  } -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, +static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  				       struct page *page,  				       unsigned int nr_pages,  				       struct page_cgroup *pc, @@ -2465,14 +2475,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,  	lock_page_cgroup(pc);  	if 
(unlikely(PageCgroupUsed(pc))) {  		unlock_page_cgroup(pc); -		__mem_cgroup_cancel_charge(mem, nr_pages); +		__mem_cgroup_cancel_charge(memcg, nr_pages);  		return;  	}  	/*  	 * we don't need page_cgroup_lock about tail pages, becase they are not  	 * accessed by any other context at this point.  	 */ -	pc->mem_cgroup = mem; +	pc->mem_cgroup = memcg;  	/*  	 * We access a page_cgroup asynchronously without lock_page_cgroup().  	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup @@ -2495,14 +2505,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,  		break;  	} -	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); +	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);  	unlock_page_cgroup(pc);  	/*  	 * "charge_statistics" updated event counter. Then, check it.  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.  	 * if they exceeds softlimit.  	 */ -	memcg_check_events(mem, page); +	memcg_check_events(memcg, page);  }  #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2689,7 +2699,7 @@ out:  static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,  				gfp_t gfp_mask, enum charge_type ctype)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	unsigned int nr_pages = 1;  	struct page_cgroup *pc;  	bool oom = true; @@ -2708,11 +2718,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,  	pc = lookup_page_cgroup(page);  	BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ -	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); -	if (ret || !mem) +	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); +	if (ret || !memcg)  		return ret; -	__mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); +	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);  	return 0;  } @@ -2741,7 +2751,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,  					enum charge_type ctype);  static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,  					enum charge_type ctype)  {  	struct page_cgroup *pc = lookup_page_cgroup(page); @@ -2751,7 +2761,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,  	 * LRU. Take care of it.  	 */  	mem_cgroup_lru_del_before_commit(page); -	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype); +	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);  	mem_cgroup_lru_add_after_commit(page);  	return;  } @@ -2759,7 +2769,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  				gfp_t gfp_mask)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	int ret;  	if (mem_cgroup_disabled()) @@ -2771,8 +2781,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  		mm = &init_mm;  	if (page_is_file_cache(page)) { -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); -		if (ret || !mem) +		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); +		if (ret || !memcg)  			return ret;  		/* @@ -2780,15 +2790,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  		 * put that would remove them from the LRU list, make  		 * sure that they get relinked properly.  		 
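The commit path above stores pc->mem_cgroup before the page_cgroup is marked used, because, as the comment notes, page_cgroup can be inspected asynchronously without lock_page_cgroup(). A hedged C11 analogue of that publish-then-flag ordering (this illustrates the general pattern only, not the kernel's actual lock/barrier scheme):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct pc {
        void *owner;            /* payload, plays the role of pc->mem_cgroup */
        atomic_bool used;       /* plays the role of the "used" flag */
};

static void commit(struct pc *pc, void *owner)
{
        pc->owner = owner;
        /* release: the owner store above is visible before "used" reads true */
        atomic_store_explicit(&pc->used, true, memory_order_release);
}

static void *lookup(struct pc *pc)
{
        /* acquire pairs with the release above */
        if (!atomic_load_explicit(&pc->used, memory_order_acquire))
                return NULL;
        return pc->owner;
}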
*/ -		__mem_cgroup_commit_charge_lrucare(page, mem, +		__mem_cgroup_commit_charge_lrucare(page, memcg,  					MEM_CGROUP_CHARGE_TYPE_CACHE);  		return ret;  	}  	/* shmem */  	if (PageSwapCache(page)) { -		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); +		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);  		if (!ret) -			__mem_cgroup_commit_charge_swapin(page, mem, +			__mem_cgroup_commit_charge_swapin(page, memcg,  					MEM_CGROUP_CHARGE_TYPE_SHMEM);  	} else  		ret = mem_cgroup_charge_common(page, mm, gfp_mask, @@ -2807,7 +2817,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,  				 struct page *page,  				 gfp_t mask, struct mem_cgroup **ptr)  { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	int ret;  	*ptr = NULL; @@ -2825,12 +2835,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,  	 */  	if (!PageSwapCache(page))  		goto charge_cur_mm; -	mem = try_get_mem_cgroup_from_page(page); -	if (!mem) +	memcg = try_get_mem_cgroup_from_page(page); +	if (!memcg)  		goto charge_cur_mm; -	*ptr = mem; +	*ptr = memcg;  	ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); -	css_put(&mem->css); +	css_put(&memcg->css);  	return ret;  charge_cur_mm:  	if (unlikely(!mm)) @@ -2890,16 +2900,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)  					MEM_CGROUP_CHARGE_TYPE_MAPPED);  } -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)  {  	if (mem_cgroup_disabled())  		return; -	if (!mem) +	if (!memcg)  		return; -	__mem_cgroup_cancel_charge(mem, 1); +	__mem_cgroup_cancel_charge(memcg, 1);  } -static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, +static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,  				   unsigned int nr_pages,  				   const enum charge_type ctype)  { @@ -2917,7 +2927,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,  	 * uncharges. Then, it's ok to ignore memcg's refcnt.  	 */  	if (!batch->memcg) -		batch->memcg = mem; +		batch->memcg = memcg;  	/*  	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.  	 * In those cases, all pages freed continuously can be expected to be in @@ -2937,7 +2947,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,  	 * merge a series of uncharges to an uncharge of res_counter.  	 * If not, we uncharge res_counter ony by one.  	 
*/ -	if (batch->memcg != mem) +	if (batch->memcg != memcg)  		goto direct_uncharge;  	/* remember freed charge and uncharge it later */  	batch->nr_pages++; @@ -2945,11 +2955,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,  		batch->memsw_nr_pages++;  	return;  direct_uncharge: -	res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); +	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);  	if (uncharge_memsw) -		res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); -	if (unlikely(batch->memcg != mem)) -		memcg_oom_recover(mem); +		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); +	if (unlikely(batch->memcg != memcg)) +		memcg_oom_recover(memcg);  	return;  } @@ -2959,7 +2969,7 @@ direct_uncharge:  static struct mem_cgroup *  __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	unsigned int nr_pages = 1;  	struct page_cgroup *pc; @@ -2982,7 +2992,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  	lock_page_cgroup(pc); -	mem = pc->mem_cgroup; +	memcg = pc->mem_cgroup;  	if (!PageCgroupUsed(pc))  		goto unlock_out; @@ -3005,7 +3015,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  		break;  	} -	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); +	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);  	ClearPageCgroupUsed(pc);  	/* @@ -3017,18 +3027,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  	unlock_page_cgroup(pc);  	/* -	 * even after unlock, we have mem->res.usage here and this memcg +	 * even after unlock, we have memcg->res.usage here and this memcg  	 * will never be freed.  	 */ -	memcg_check_events(mem, page); +	memcg_check_events(memcg, page);  	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { -		mem_cgroup_swap_statistics(mem, true); -		mem_cgroup_get(mem); +		mem_cgroup_swap_statistics(memcg, true); +		mem_cgroup_get(memcg);  	} -	if (!mem_cgroup_is_root(mem)) -		mem_cgroup_do_uncharge(mem, nr_pages, ctype); +	if (!mem_cgroup_is_root(memcg)) +		mem_cgroup_do_uncharge(memcg, nr_pages, ctype); -	return mem; +	return memcg;  unlock_out:  	unlock_page_cgroup(pc); @@ -3218,7 +3228,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,  int mem_cgroup_prepare_migration(struct page *page,  	struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)  { -	struct mem_cgroup *mem = NULL; +	struct mem_cgroup *memcg = NULL;  	struct page_cgroup *pc;  	enum charge_type ctype;  	int ret = 0; @@ -3232,8 +3242,8 @@ int mem_cgroup_prepare_migration(struct page *page,  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) { -		mem = pc->mem_cgroup; -		css_get(&mem->css); +		memcg = pc->mem_cgroup; +		css_get(&memcg->css);  		/*  		 * At migrating an anonymous page, its mapcount goes down  		 * to 0 and uncharge() will be called. But, even if it's fully @@ -3271,12 +3281,12 @@ int mem_cgroup_prepare_migration(struct page *page,  	 * If the page is not charged at this point,  	 * we return here.  	 
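mem_cgroup_do_uncharge() above coalesces a run of uncharges against one memcg into current's batch and only falls back to direct res_counter_uncharge() when batching does not apply. A simplified sketch of that accumulate-then-flush idea (the kernel checks several more conditions before batching; the structures here are illustrative):

#include <stdatomic.h>
#include <stddef.h>

struct group { atomic_ulong usage; };

struct batch {
        struct group *memcg;    /* first group seen in this batch, or NULL */
        unsigned long nr_pages; /* pages collected for that group */
};

static void direct_uncharge(struct group *g, unsigned long nr)
{
        atomic_fetch_sub(&g->usage, nr);
}

static void uncharge(struct batch *b, struct group *g, unsigned long nr)
{
        if (!b) {                       /* no batch running: uncharge immediately */
                direct_uncharge(g, nr);
                return;
        }
        if (!b->memcg)                  /* bind the batch to the first group we meet */
                b->memcg = g;
        if (b->memcg != g) {            /* another group: don't mix, go direct */
                direct_uncharge(g, nr);
                return;
        }
        b->nr_pages += nr;              /* remember the charge, flush it later in one go */
}

static void flush_batch(struct batch *b)
{
        if (b->memcg && b->nr_pages)
                direct_uncharge(b->memcg, b->nr_pages);
        b->memcg = NULL;
        b->nr_pages = 0;
}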
*/ -	if (!mem) +	if (!memcg)  		return 0; -	*ptr = mem; +	*ptr = memcg;  	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); -	css_put(&mem->css);/* drop extra refcnt */ +	css_put(&memcg->css);/* drop extra refcnt */  	if (ret || *ptr == NULL) {  		if (PageAnon(page)) {  			lock_page_cgroup(pc); @@ -3302,21 +3312,21 @@ int mem_cgroup_prepare_migration(struct page *page,  		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;  	else  		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; -	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype); +	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);  	return ret;  }  /* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *mem, +void mem_cgroup_end_migration(struct mem_cgroup *memcg,  	struct page *oldpage, struct page *newpage, bool migration_ok)  {  	struct page *used, *unused;  	struct page_cgroup *pc; -	if (!mem) +	if (!memcg)  		return;  	/* blocks rmdir() */ -	cgroup_exclude_rmdir(&mem->css); +	cgroup_exclude_rmdir(&memcg->css);  	if (!migration_ok) {  		used = oldpage;  		unused = newpage; @@ -3352,7 +3362,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,  	 * So, rmdir()->pre_destroy() can be called while we do this charge.  	 * In that case, we need to call pre_destroy() again. check it here.  	 */ -	cgroup_release_and_wakeup_rmdir(&mem->css); +	cgroup_release_and_wakeup_rmdir(&memcg->css);  }  #ifdef CONFIG_DEBUG_VM @@ -3431,7 +3441,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,  		/*  		 * Rather than hide all in some function, I do this in  		 * open coded manner. You see what this really does. -		 * We have to guarantee mem->res.limit < mem->memsw.limit. +		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.  		 */  		mutex_lock(&set_limit_mutex);  		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); @@ -3493,7 +3503,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,  		/*  		 * Rather than hide all in some function, I do this in  		 * open coded manner. You see what this really does. -		 * We have to guarantee mem->res.limit < mem->memsw.limit. +		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.  		 */  		mutex_lock(&set_limit_mutex);  		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -3631,7 +3641,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,   * This routine traverse page_cgroup in given list and drop them all.   * *And* this routine doesn't reclaim page itself, just removes page_cgroup.   */ -static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, +static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,  				int node, int zid, enum lru_list lru)  {  	struct zone *zone; @@ -3642,7 +3652,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,  	int ret = 0;  	zone = &NODE_DATA(node)->node_zones[zid]; -	mz = mem_cgroup_zoneinfo(mem, node, zid); +	mz = mem_cgroup_zoneinfo(memcg, node, zid);  	list = &mz->lists[lru];  	loop = MEM_CGROUP_ZSTAT(mz, lru); @@ -3669,7 +3679,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,  		page = lookup_cgroup_page(pc); -		ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); +		ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);  		if (ret == -ENOMEM)  			break; @@ -3690,14 +3700,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,   * make mem_cgroup's charge to be 0 if there is no task.   * This enables deleting this mem_cgroup.   
*/ -static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) +static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)  {  	int ret;  	int node, zid, shrink;  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; -	struct cgroup *cgrp = mem->css.cgroup; +	struct cgroup *cgrp = memcg->css.cgroup; -	css_get(&mem->css); +	css_get(&memcg->css);  	shrink = 0;  	/* should free all ? */ @@ -3713,14 +3723,14 @@ move_account:  			goto out;  		/* This is for making all *used* pages to be on LRU. */  		lru_add_drain_all(); -		drain_all_stock_sync(mem); +		drain_all_stock_sync(memcg);  		ret = 0; -		mem_cgroup_start_move(mem); +		mem_cgroup_start_move(memcg);  		for_each_node_state(node, N_HIGH_MEMORY) {  			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {  				enum lru_list l;  				for_each_lru(l) { -					ret = mem_cgroup_force_empty_list(mem, +					ret = mem_cgroup_force_empty_list(memcg,  							node, zid, l);  					if (ret)  						break; @@ -3729,16 +3739,16 @@ move_account:  			if (ret)  				break;  		} -		mem_cgroup_end_move(mem); -		memcg_oom_recover(mem); +		mem_cgroup_end_move(memcg); +		memcg_oom_recover(memcg);  		/* it seems parent cgroup doesn't have enough mem */  		if (ret == -ENOMEM)  			goto try_to_free;  		cond_resched();  	/* "ret" should also be checked to ensure all lists are empty. */ -	} while (mem->res.usage > 0 || ret); +	} while (memcg->res.usage > 0 || ret);  out: -	css_put(&mem->css); +	css_put(&memcg->css);  	return ret;  try_to_free: @@ -3751,14 +3761,14 @@ try_to_free:  	lru_add_drain_all();  	/* try to free all pages in this cgroup */  	shrink = 1; -	while (nr_retries && mem->res.usage > 0) { +	while (nr_retries && memcg->res.usage > 0) {  		int progress;  		if (signal_pending(current)) {  			ret = -EINTR;  			goto out;  		} -		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, +		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,  						false);  		if (!progress) {  			nr_retries--; @@ -3787,12 +3797,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,  					u64 val)  {  	int retval = 0; -	struct mem_cgroup *mem = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	struct cgroup *parent = cont->parent; -	struct mem_cgroup *parent_mem = NULL; +	struct mem_cgroup *parent_memcg = NULL;  	if (parent) -		parent_mem = mem_cgroup_from_cont(parent); +		parent_memcg = mem_cgroup_from_cont(parent);  	cgroup_lock();  	/* @@ -3803,10 +3813,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,  	 * For the root cgroup, parent_mem is NULL, we allow value to be  	 * set if there are no children.  	 */ -	if ((!parent_mem || !parent_mem->use_hierarchy) && +	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&  				(val == 1 || val == 0)) {  		if (list_empty(&cont->children)) -			mem->use_hierarchy = val; +			memcg->use_hierarchy = val;  		else  			retval = -EBUSY;  	} else @@ -3817,14 +3827,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,  } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,  					       enum mem_cgroup_stat_index idx)  {  	struct mem_cgroup *iter;  	long val = 0;  	/* Per-cpu values can be negative, use a signed accumulator */ -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		val += mem_cgroup_read_stat(iter, idx);  	if (val < 0) /* race ? 
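mem_cgroup_recursive_stat() above sums per-cpu counters that can individually be negative and clamps a transiently negative total to zero. A small stand-alone sketch of that signed accumulation (a fixed array stands in for the per-cpu counters):

#include <stddef.h>

/* one signed delta per CPU; individual entries may be negative */
static long per_cpu_count[8];

static unsigned long read_stat(void)
{
        long val = 0;
        size_t cpu;

        for (cpu = 0; cpu < sizeof(per_cpu_count) / sizeof(per_cpu_count[0]); cpu++)
                val += per_cpu_count[cpu];

        if (val < 0)            /* transient race: report 0 rather than a huge unsigned value */
                val = 0;
        return (unsigned long)val;
}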
*/ @@ -3832,29 +3842,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,  	return val;  } -static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)  {  	u64 val; -	if (!mem_cgroup_is_root(mem)) { +	if (!mem_cgroup_is_root(memcg)) {  		if (!swap) -			return res_counter_read_u64(&mem->res, RES_USAGE); +			return res_counter_read_u64(&memcg->res, RES_USAGE);  		else -			return res_counter_read_u64(&mem->memsw, RES_USAGE); +			return res_counter_read_u64(&memcg->memsw, RES_USAGE);  	} -	val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); -	val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); +	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); +	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);  	if (swap) -		val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); +		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);  	return val << PAGE_SHIFT;  }  static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	u64 val;  	int type, name; @@ -3863,15 +3873,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)  	switch (type) {  	case _MEM:  		if (name == RES_USAGE) -			val = mem_cgroup_usage(mem, false); +			val = mem_cgroup_usage(memcg, false);  		else -			val = res_counter_read_u64(&mem->res, name); +			val = res_counter_read_u64(&memcg->res, name);  		break;  	case _MEMSWAP:  		if (name == RES_USAGE) -			val = mem_cgroup_usage(mem, true); +			val = mem_cgroup_usage(memcg, true);  		else -			val = res_counter_read_u64(&mem->memsw, name); +			val = res_counter_read_u64(&memcg->memsw, name);  		break;  	default:  		BUG(); @@ -3959,24 +3969,24 @@ out:  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)  { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	int type, name; -	mem = mem_cgroup_from_cont(cont); +	memcg = mem_cgroup_from_cont(cont);  	type = MEMFILE_TYPE(event);  	name = MEMFILE_ATTR(event);  	switch (name) {  	case RES_MAX_USAGE:  		if (type == _MEM) -			res_counter_reset_max(&mem->res); +			res_counter_reset_max(&memcg->res);  		else -			res_counter_reset_max(&mem->memsw); +			res_counter_reset_max(&memcg->memsw);  		break;  	case RES_FAILCNT:  		if (type == _MEM) -			res_counter_reset_failcnt(&mem->res); +			res_counter_reset_failcnt(&memcg->res);  		else -			res_counter_reset_failcnt(&mem->memsw); +			res_counter_reset_failcnt(&memcg->memsw);  		break;  	} @@ -3993,7 +4003,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,  static int mem_cgroup_move_charge_write(struct cgroup *cgrp,  					struct cftype *cft, u64 val)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	if (val >= (1 << NR_MOVE_TYPE))  		return -EINVAL; @@ -4003,7 +4013,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,  	 * inconsistent.  	 
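mem_cgroup_read() and mem_cgroup_reset() above recover a resource type (_MEM vs _MEMSWAP) and an attribute (RES_USAGE, RES_LIMIT, ...) from the single cft->private/event value through MEMFILE_TYPE()/MEMFILE_ATTR(). The usual way to pack two small values into one integer looks roughly like this (the 16-bit split and the enum names are assumptions of the sketch, not necessarily memcontrol.c's exact definitions):

/* pack a (type, attr) pair into one int and take it apart again */
#define MEMFILE_PRIVATE(type, attr)     (((type) << 16) | (attr))
#define MEMFILE_TYPE(val)               ((val) >> 16)
#define MEMFILE_ATTR(val)               ((val) & 0xffff)

enum res_type { RES_MEM, RES_MEMSWAP };
enum res_attr { ATTR_USAGE, ATTR_LIMIT, ATTR_MAX_USAGE, ATTR_FAILCNT };

static const int example_private = MEMFILE_PRIVATE(RES_MEMSWAP, ATTR_FAILCNT);
/* MEMFILE_TYPE(example_private) == RES_MEMSWAP, MEMFILE_ATTR(example_private) == ATTR_FAILCNT */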
*/  	cgroup_lock(); -	mem->move_charge_at_immigrate = val; +	memcg->move_charge_at_immigrate = val;  	cgroup_unlock();  	return 0; @@ -4060,49 +4070,49 @@ struct {  static void -mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)  {  	s64 val;  	/* per cpu stat */ -	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); +	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);  	s->stat[MCS_CACHE] += val * PAGE_SIZE; -	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); +	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);  	s->stat[MCS_RSS] += val * PAGE_SIZE; -	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); +	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);  	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; -	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); +	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);  	s->stat[MCS_PGPGIN] += val; -	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); +	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);  	s->stat[MCS_PGPGOUT] += val;  	if (do_swap_account) { -		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); +		val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);  		s->stat[MCS_SWAP] += val * PAGE_SIZE;  	} -	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); +	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);  	s->stat[MCS_PGFAULT] += val; -	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); +	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);  	s->stat[MCS_PGMAJFAULT] += val;  	/* per zone stat */ -	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); +	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));  	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; -	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); +	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));  	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; -	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); +	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));  	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; -	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); +	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));  	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; -	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); +	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));  	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;  }  static void -mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)  {  	struct mem_cgroup *iter; -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		mem_cgroup_get_local_stat(iter, s);  } @@ -4188,8 +4198,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,  	}  #ifdef CONFIG_DEBUG_VM -	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); -  	{  		int nid, zid;  		struct mem_cgroup_per_zone *mz; @@ -4326,20 +4334,20 @@ static int compare_thresholds(const void *a, const void *b)  	return _a->threshold - _b->threshold;  } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)  {  	struct mem_cgroup_eventfd_list *ev; -	list_for_each_entry(ev, &mem->oom_notify, list) +	list_for_each_entry(ev, &memcg->oom_notify, list)  		
eventfd_signal(ev->eventfd, 1);  	return 0;  } -static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)  {  	struct mem_cgroup *iter; -	for_each_mem_cgroup_tree(iter, mem) +	for_each_mem_cgroup_tree(iter, memcg)  		mem_cgroup_oom_notify_cb(iter);  } @@ -4529,7 +4537,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,  static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,  	struct cftype *cft, struct eventfd_ctx *eventfd)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup_eventfd_list *ev, *tmp;  	int type = MEMFILE_TYPE(cft->private); @@ -4537,7 +4545,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,  	spin_lock(&memcg_oom_lock); -	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { +	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {  		if (ev->eventfd == eventfd) {  			list_del(&ev->list);  			kfree(ev); @@ -4550,11 +4558,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,  static int mem_cgroup_oom_control_read(struct cgroup *cgrp,  	struct cftype *cft,  struct cgroup_map_cb *cb)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); -	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); +	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); -	if (atomic_read(&mem->under_oom)) +	if (atomic_read(&memcg->under_oom))  		cb->fill(cb, "under_oom", 1);  	else  		cb->fill(cb, "under_oom", 0); @@ -4564,7 +4572,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,  static int mem_cgroup_oom_control_write(struct cgroup *cgrp,  	struct cftype *cft, u64 val)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup *parent;  	/* cannot set to root cgroup and only 0 and 1 are allowed */ @@ -4576,13 +4584,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,  	cgroup_lock();  	/* oom-kill-disable is a flag for subhierarchy. 
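mem_cgroup_oom_notify_cb() above signals every eventfd registered on the group's oom_notify list, which is how user space learns about memcg OOM events. A minimal consumer of such an eventfd (the registration step is only described in the comment and should be checked against the memcg documentation):

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        uint64_t hits;
        int efd = eventfd(0, 0);        /* counter starts at 0 */

        if (efd < 0) {
                perror("eventfd");
                return 1;
        }
        /*
         * Registration is omitted here: for memcg OOM notification the efd is
         * handed to the kernel by writing "<efd> <fd of memory.oom_control>"
         * to the group's cgroup.event_control file (see the memcg docs).
         */
        for (;;) {
                if (read(efd, &hits, sizeof(hits)) != sizeof(hits))
                        break;
                printf("oom events signalled: %llu\n", (unsigned long long)hits);
        }
        close(efd);
        return 0;
}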
*/  	if ((parent->use_hierarchy) || -	    (mem->use_hierarchy && !list_empty(&cgrp->children))) { +	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {  		cgroup_unlock();  		return -EINVAL;  	} -	mem->oom_kill_disable = val; +	memcg->oom_kill_disable = val;  	if (!val) -		memcg_oom_recover(mem); +		memcg_oom_recover(memcg);  	cgroup_unlock();  	return 0;  } @@ -4718,7 +4726,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)  }  #endif -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  {  	struct mem_cgroup_per_node *pn;  	struct mem_cgroup_per_zone *mz; @@ -4738,21 +4746,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)  	if (!pn)  		return 1; -	mem->info.nodeinfo[node] = pn;  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {  		mz = &pn->zoneinfo[zone];  		for_each_lru(l)  			INIT_LIST_HEAD(&mz->lists[l]);  		mz->usage_in_excess = 0;  		mz->on_tree = false; -		mz->mem = mem; +		mz->mem = memcg;  	} +	memcg->info.nodeinfo[node] = pn;  	return 0;  } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  { -	kfree(mem->info.nodeinfo[node]); +	kfree(memcg->info.nodeinfo[node]);  }  static struct mem_cgroup *mem_cgroup_alloc(void) @@ -4794,51 +4802,51 @@ out_free:   * Removal of cgroup itself succeeds regardless of refs from swap.   */ -static void __mem_cgroup_free(struct mem_cgroup *mem) +static void __mem_cgroup_free(struct mem_cgroup *memcg)  {  	int node; -	mem_cgroup_remove_from_trees(mem); -	free_css_id(&mem_cgroup_subsys, &mem->css); +	mem_cgroup_remove_from_trees(memcg); +	free_css_id(&mem_cgroup_subsys, &memcg->css);  	for_each_node_state(node, N_POSSIBLE) -		free_mem_cgroup_per_zone_info(mem, node); +		free_mem_cgroup_per_zone_info(memcg, node); -	free_percpu(mem->stat); +	free_percpu(memcg->stat);  	if (sizeof(struct mem_cgroup) < PAGE_SIZE) -		kfree(mem); +		kfree(memcg);  	else -		vfree(mem); +		vfree(memcg);  } -static void mem_cgroup_get(struct mem_cgroup *mem) +static void mem_cgroup_get(struct mem_cgroup *memcg)  { -	atomic_inc(&mem->refcnt); +	atomic_inc(&memcg->refcnt);  } -static void __mem_cgroup_put(struct mem_cgroup *mem, int count) +static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)  { -	if (atomic_sub_and_test(count, &mem->refcnt)) { -		struct mem_cgroup *parent = parent_mem_cgroup(mem); -		__mem_cgroup_free(mem); +	if (atomic_sub_and_test(count, &memcg->refcnt)) { +		struct mem_cgroup *parent = parent_mem_cgroup(memcg); +		__mem_cgroup_free(memcg);  		if (parent)  			mem_cgroup_put(parent);  	}  } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void mem_cgroup_put(struct mem_cgroup *memcg)  { -	__mem_cgroup_put(mem, 1); +	__mem_cgroup_put(memcg, 1);  }  /*   * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.   
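mem_cgroup_get()/__mem_cgroup_put() above keep a memcg alive for lingering swap references and, on the final put, free it and then drop the reference it held on its parent. A compact sketch of that get/put-with-parent pattern (C11 atomics and free() stand in for atomic_t and the kernel's allocator):

#include <stdatomic.h>
#include <stdlib.h>

struct group {
        atomic_int refcnt;
        struct group *parent;   /* reference held on the parent while we exist */
};

static void group_get(struct group *g)
{
        atomic_fetch_add(&g->refcnt, 1);
}

static void group_put(struct group *g)
{
        while (g && atomic_fetch_sub(&g->refcnt, 1) == 1) {
                struct group *parent = g->parent;

                free(g);        /* last reference gone: release the object ... */
                g = parent;     /* ... and drop the reference it pinned on its parent */
        }
}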
*/ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)  { -	if (!mem->res.parent) +	if (!memcg->res.parent)  		return NULL; -	return mem_cgroup_from_res_counter(mem->res.parent, res); +	return mem_cgroup_from_res_counter(memcg->res.parent, res);  }  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4881,16 +4889,16 @@ static int mem_cgroup_soft_limit_tree_init(void)  static struct cgroup_subsys_state * __ref  mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)  { -	struct mem_cgroup *mem, *parent; +	struct mem_cgroup *memcg, *parent;  	long error = -ENOMEM;  	int node; -	mem = mem_cgroup_alloc(); -	if (!mem) +	memcg = mem_cgroup_alloc(); +	if (!memcg)  		return ERR_PTR(error);  	for_each_node_state(node, N_POSSIBLE) -		if (alloc_mem_cgroup_per_zone_info(mem, node)) +		if (alloc_mem_cgroup_per_zone_info(memcg, node))  			goto free_out;  	/* root ? */ @@ -4898,7 +4906,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)  		int cpu;  		enable_swap_cgroup();  		parent = NULL; -		root_mem_cgroup = mem; +		root_mem_cgroup = memcg;  		if (mem_cgroup_soft_limit_tree_init())  			goto free_out;  		for_each_possible_cpu(cpu) { @@ -4909,13 +4917,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)  		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);  	} else {  		parent = mem_cgroup_from_cont(cont->parent); -		mem->use_hierarchy = parent->use_hierarchy; -		mem->oom_kill_disable = parent->oom_kill_disable; +		memcg->use_hierarchy = parent->use_hierarchy; +		memcg->oom_kill_disable = parent->oom_kill_disable;  	}  	if (parent && parent->use_hierarchy) { -		res_counter_init(&mem->res, &parent->res); -		res_counter_init(&mem->memsw, &parent->memsw); +		res_counter_init(&memcg->res, &parent->res); +		res_counter_init(&memcg->memsw, &parent->memsw);  		/*  		 * We increment refcnt of the parent to ensure that we can  		 * safely access it on res_counter_charge/uncharge. 
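With use_hierarchy enabled, res_counter_init(&memcg->res, &parent->res) above chains the child's counter to its parent, so a charge is accounted and limit-checked at every level and a failure part-way up has to be unwound. A hedged sketch of such a chained charge (one mutex per counter instead of res_counter's spinlock; the names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct counter {
        pthread_mutex_t lock;
        unsigned long usage, limit;
        struct counter *parent; /* NULL at the root of the hierarchy */
};

static bool counter_charge(struct counter *c, unsigned long val)
{
        struct counter *it, *undo;

        for (it = c; it; it = it->parent) {
                bool ok;

                pthread_mutex_lock(&it->lock);
                ok = it->usage + val <= it->limit;
                if (ok)
                        it->usage += val;
                pthread_mutex_unlock(&it->lock);

                if (!ok) {
                        /* roll back everything charged below the failing level */
                        for (undo = c; undo != it; undo = undo->parent) {
                                pthread_mutex_lock(&undo->lock);
                                undo->usage -= val;
                                pthread_mutex_unlock(&undo->lock);
                        }
                        return false;
                }
        }
        return true;
}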
@@ -4924,21 +4932,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)  		 */  		mem_cgroup_get(parent);  	} else { -		res_counter_init(&mem->res, NULL); -		res_counter_init(&mem->memsw, NULL); +		res_counter_init(&memcg->res, NULL); +		res_counter_init(&memcg->memsw, NULL);  	} -	mem->last_scanned_child = 0; -	mem->last_scanned_node = MAX_NUMNODES; -	INIT_LIST_HEAD(&mem->oom_notify); +	memcg->last_scanned_child = 0; +	memcg->last_scanned_node = MAX_NUMNODES; +	INIT_LIST_HEAD(&memcg->oom_notify);  	if (parent) -		mem->swappiness = mem_cgroup_swappiness(parent); -	atomic_set(&mem->refcnt, 1); -	mem->move_charge_at_immigrate = 0; -	mutex_init(&mem->thresholds_lock); -	return &mem->css; +		memcg->swappiness = mem_cgroup_swappiness(parent); +	atomic_set(&memcg->refcnt, 1); +	memcg->move_charge_at_immigrate = 0; +	mutex_init(&memcg->thresholds_lock); +	return &memcg->css;  free_out: -	__mem_cgroup_free(mem); +	__mem_cgroup_free(memcg);  	root_mem_cgroup = NULL;  	return ERR_PTR(error);  } @@ -4946,17 +4954,17 @@ free_out:  static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,  					struct cgroup *cont)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	return mem_cgroup_force_empty(mem, false); +	return mem_cgroup_force_empty(memcg, false);  }  static void mem_cgroup_destroy(struct cgroup_subsys *ss,  				struct cgroup *cont)  { -	struct mem_cgroup *mem = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	mem_cgroup_put(mem); +	mem_cgroup_put(memcg);  }  static int mem_cgroup_populate(struct cgroup_subsys *ss, @@ -4979,9 +4987,9 @@ static int mem_cgroup_do_precharge(unsigned long count)  {  	int ret = 0;  	int batch_count = PRECHARGE_COUNT_AT_ONCE; -	struct mem_cgroup *mem = mc.to; +	struct mem_cgroup *memcg = mc.to; -	if (mem_cgroup_is_root(mem)) { +	if (mem_cgroup_is_root(memcg)) {  		mc.precharge += count;  		/* we don't need css_get for root */  		return ret; @@ -4990,16 +4998,16 @@ static int mem_cgroup_do_precharge(unsigned long count)  	if (count > 1) {  		struct res_counter *dummy;  		/* -		 * "mem" cannot be under rmdir() because we've already checked +		 * "memcg" cannot be under rmdir() because we've already checked  		 * by cgroup_lock_live_cgroup() that it is not removed and we  		 * are still under the same cgroup_mutex. So we can postpone  		 * css_get().  		 
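mem_cgroup_do_precharge() here first tries to take all the charges in one res_counter operation and only falls back to charging page by page when the bulk attempt fails. A stand-alone sketch of that bulk-then-fallback shape (charge() and the budget are stand-ins for res_counter_charge(); the kernel also reschedules periodically in the slow loop):

static unsigned long budget = 1024;     /* stands in for the res_counter limit headroom */

static _Bool charge(unsigned long nr_pages)
{
        if (budget < nr_pages)
                return 0;
        budget -= nr_pages;
        return 1;
}

static long precharge(unsigned long count)
{
        unsigned long done;

        if (count > 1 && charge(count))
                return count;                   /* one bulk charge covered everything */

        for (done = 0; done < count; done++)    /* fall back to page-by-page charging */
                if (!charge(1))
                        return -1;              /* caller is expected to undo "done" charges */
        return done;
}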
*/ -		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) +		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))  			goto one_by_one; -		if (do_swap_account && res_counter_charge(&mem->memsw, +		if (do_swap_account && res_counter_charge(&memcg->memsw,  						PAGE_SIZE * count, &dummy)) { -			res_counter_uncharge(&mem->res, PAGE_SIZE * count); +			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);  			goto one_by_one;  		}  		mc.precharge += count; @@ -5016,8 +5024,9 @@ one_by_one:  			batch_count = PRECHARGE_COUNT_AT_ONCE;  			cond_resched();  		} -		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); -		if (ret || !mem) +		ret = __mem_cgroup_try_charge(NULL, +					GFP_KERNEL, 1, &memcg, false); +		if (ret || !memcg)  			/* mem_cgroup_clear_mc() will do uncharge later */  			return -ENOMEM;  		mc.precharge++; @@ -5291,13 +5300,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,  				struct task_struct *p)  {  	int ret = 0; -	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); -	if (mem->move_charge_at_immigrate) { +	if (memcg->move_charge_at_immigrate) {  		struct mm_struct *mm;  		struct mem_cgroup *from = mem_cgroup_from_task(p); -		VM_BUG_ON(from == mem); +		VM_BUG_ON(from == memcg);  		mm = get_task_mm(p);  		if (!mm) @@ -5312,7 +5321,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,  			mem_cgroup_start_move(from);  			spin_lock(&mc.lock);  			mc.from = from; -			mc.to = mem; +			mc.to = memcg;  			spin_unlock(&mc.lock);  			/* We set mc.moving_task later */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2b43ba051ac..edc388db730 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1310,7 +1310,7 @@ int unpoison_memory(unsigned long pfn)  		 * to the end.  		 
*/  		if (PageHuge(page)) { -			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); +			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);  			return 0;  		}  		if (TestClearPageHWPoison(p)) @@ -1419,7 +1419,7 @@ static int soft_offline_huge_page(struct page *page, int flags)  	if (PageHWPoison(hpage)) {  		put_page(hpage); -		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); +		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);  		return -EBUSY;  	} @@ -1433,8 +1433,8 @@ static int soft_offline_huge_page(struct page *page, int flags)  		list_for_each_entry_safe(page1, page2, &pagelist, lru)  			put_page(page1); -		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", -			 pfn, ret, page->flags); +		pr_info("soft offline: %#lx: migration failed %d, type %lx\n", +			pfn, ret, page->flags);  		if (ret > 0)  			ret = -EIO;  		return ret; @@ -1505,7 +1505,7 @@ int soft_offline_page(struct page *page, int flags)  	}  	if (!PageLRU(page)) {  		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", -				pfn, page->flags); +			pfn, page->flags);  		return -EIO;  	} @@ -1566,7 +1566,7 @@ int soft_offline_page(struct page *page, int flags)  		}  	} else {  		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", -				pfn, ret, page_count(page), page->flags); +			pfn, ret, page_count(page), page->flags);  	}  	if (ret)  		return ret; diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b..b2b87315cdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,7 +1503,7 @@ split_fallthrough:  	}  	if (flags & FOLL_GET) -		get_page(page); +		get_page_foll(page);  	if (flags & FOLL_TOUCH) {  		if ((flags & FOLL_WRITE) &&  		    !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c51f9f58ca..cd237f47830 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -111,7 +111,7 @@ enum zone_type policy_zone = 0;  /*   * run-time system-wide default policy => local allocation   */ -struct mempolicy default_policy = { +static struct mempolicy default_policy = {  	.refcnt = ATOMIC_INIT(1), /* never free it */  	.mode = MPOL_PREFERRED,  	.flags = MPOL_F_LOCAL, diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e67741..33358f87811 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,  		ptep = pte_offset_map(pmd, addr); -		if (!is_swap_pte(*ptep)) { -			pte_unmap(ptep); -			goto out; -		} +		/* +		 * Peek to check is_swap_pte() before taking ptlock?  No, we +		 * can race mremap's move_ptes(), which skips anon_vma lock. +		 */  		ptl = pte_lockptr(mm, pmd);  	} @@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,  	return rc;  } -/* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. - */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, -			struct page *page, int force, bool offlining, bool sync) +static int __unmap_and_move(struct page *page, struct page *newpage, +				int force, bool offlining, bool sync)  { -	int rc = 0; -	int *result = NULL; -	struct page *newpage = get_new_page(page, private, &result); +	int rc = -EAGAIN;  	int remap_swapcache = 1;  	int charge = 0;  	struct mem_cgroup *mem;  	struct anon_vma *anon_vma = NULL; -	if (!newpage) -		return -ENOMEM; - -	if (page_count(page) == 1) { -		/* page was freed from under us. So we are done. 
*/ -		goto move_newpage; -	} -	if (unlikely(PageTransHuge(page))) -		if (unlikely(split_huge_page(page))) -			goto move_newpage; - -	/* prepare cgroup just returns 0 or -ENOMEM */ -	rc = -EAGAIN; -  	if (!trylock_page(page)) {  		if (!force || !sync) -			goto move_newpage; +			goto out;  		/*  		 * It's not safe for direct compaction to call lock_page. @@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,  		 * altogether.  		 */  		if (current->flags & PF_MEMALLOC) -			goto move_newpage; +			goto out;  		lock_page(page);  	} @@ -785,27 +765,52 @@ uncharge:  		mem_cgroup_end_migration(mem, page, newpage, rc == 0);  unlock:  	unlock_page(page); +out: +	return rc; +} -move_newpage: +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(new_page_t get_new_page, unsigned long private, +			struct page *page, int force, bool offlining, bool sync) +{ +	int rc = 0; +	int *result = NULL; +	struct page *newpage = get_new_page(page, private, &result); + +	if (!newpage) +		return -ENOMEM; + +	if (page_count(page) == 1) { +		/* page was freed from under us. So we are done. */ +		goto out; +	} + +	if (unlikely(PageTransHuge(page))) +		if (unlikely(split_huge_page(page))) +			goto out; + +	rc = __unmap_and_move(page, newpage, force, offlining, sync); +out:  	if (rc != -EAGAIN) { - 		/* - 		 * A page that has been migrated has all references - 		 * removed and will be freed. A page that has not been - 		 * migrated will have kepts its references and be - 		 * restored. - 		 */ - 		list_del(&page->lru); +		/* +		 * A page that has been migrated has all references +		 * removed and will be freed. A page that has not been +		 * migrated will have kepts its references and be +		 * restored. +		 */ +		list_del(&page->lru);  		dec_zone_page_state(page, NR_ISOLATED_ANON +  				page_is_file_cache(page));  		putback_lru_page(page);  	} -  	/*  	 * Move the new page to the LRU. If migration was not successful  	 * then this will free the page.  	 */  	putback_lru_page(newpage); -  	if (result) {  		if (rc)  			*result = rc; diff --git a/mm/mlock.c b/mm/mlock.c index 048260c4e02..bd34b3a1085 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)  	if (TestClearPageMlocked(page)) {  		dec_zone_page_state(page, NR_MLOCK);  		if (!isolate_lru_page(page)) { -			int ret = try_to_munlock(page); +			int ret = SWAP_AGAIN; + +			/* +			 * Optimization: if the page was mapped just once, +			 * that's our mapping and we don't need to check all the +			 * other vmas. +			 */ +			if (page_mapcount(page) > 1) +				ret = try_to_munlock(page);  			/*  			 * did try_to_unlock() succeed or punt?  			 
*/ @@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)  	if (!can_do_mlock())  		goto out; -	lru_add_drain_all();	/* flush pagevec */ +	if (flags & MCL_CURRENT) +		lru_add_drain_all();	/* flush pagevec */  	down_write(¤t->mm->mmap_sem); diff --git a/mm/mmap.c b/mm/mmap.c index a65efd4db3e..3c0061f744f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm)  {  	struct vm_area_struct *vma;  	struct anon_vma_chain *avc; -	int ret = -EINTR;  	BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm)  				vm_lock_anon_vma(mm, avc->anon_vma);  	} -	ret = 0; +	return 0;  out_unlock: -	if (ret) -		mm_drop_all_locks(mm); - -	return ret; +	mm_drop_all_locks(mm); +	return -EINTR;  }  static void vm_unlock_anon_vma(struct anon_vma *anon_vma) diff --git a/mm/mremap.c b/mm/mremap.c index 506fa44403d..d6959cb4df5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)  		return NULL;  	pmd = pmd_offset(pud, addr); -	split_huge_page_pmd(mm, pmd); -	if (pmd_none_or_clear_bad(pmd)) +	if (pmd_none(*pmd))  		return NULL;  	return pmd; @@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,  		return NULL;  	VM_BUG_ON(pmd_trans_huge(*pmd)); -	if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) -		return NULL;  	return pmd;  } @@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  	struct mm_struct *mm = vma->vm_mm;  	pte_t *old_pte, *new_pte, pte;  	spinlock_t *old_ptl, *new_ptl; -	unsigned long old_start; -	old_start = old_addr; -	mmu_notifier_invalidate_range_start(vma->vm_mm, -					    old_start, old_end);  	if (vma->vm_file) {  		/*  		 * Subtle point from Rajesh Venkatasubramanian: before @@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  				   new_pte++, new_addr += PAGE_SIZE) {  		if (pte_none(*old_pte))  			continue; -		pte = ptep_clear_flush(vma, old_addr, old_pte); +		pte = ptep_get_and_clear(mm, old_addr, old_pte);  		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);  		set_pte_at(mm, new_addr, new_pte, pte);  	} @@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  	pte_unmap_unlock(old_pte - 1, old_ptl);  	if (mapping)  		mutex_unlock(&mapping->i_mmap_mutex); -	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);  }  #define LATENCY_LIMIT	(64 * PAGE_SIZE) @@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,  {  	unsigned long extent, next, old_end;  	pmd_t *old_pmd, *new_pmd; +	bool need_flush = false;  	old_end = old_addr + len;  	flush_cache_range(vma, old_addr, old_end); +	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); +  	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {  		cond_resched();  		next = (old_addr + PMD_SIZE) & PMD_MASK; -		if (next - 1 > old_end) -			next = old_end; +		/* even if next overflowed, extent below will be ok */  		extent = next - old_addr; +		if (extent > old_end - old_addr) +			extent = old_end - old_addr;  		old_pmd = get_old_pmd(vma->vm_mm, old_addr);  		if (!old_pmd)  			continue;  		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);  		if (!new_pmd)  			break; +		if (pmd_trans_huge(*old_pmd)) { +			int err = 0; +			if (extent == HPAGE_PMD_SIZE) +				err = move_huge_pmd(vma, new_vma, old_addr, +						    new_addr, old_end, +						    old_pmd, 
new_pmd); +			if (err > 0) { +				need_flush = true; +				continue; +			} else if (!err) { +				split_huge_page_pmd(vma->vm_mm, old_pmd); +			} +			VM_BUG_ON(pmd_trans_huge(*old_pmd)); +		} +		if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, +						      new_pmd, new_addr)) +			break;  		next = (new_addr + PMD_SIZE) & PMD_MASK;  		if (extent > next - new_addr)  			extent = next - new_addr; @@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,  			extent = LATENCY_LIMIT;  		move_ptes(vma, old_pmd, old_addr, old_addr + extent,  				new_vma, new_pmd, new_addr); +		need_flush = true;  	} +	if (likely(need_flush)) +		flush_tlb_range(vma, old_end-len, old_addr); + +	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);  	return len + old_addr - old_end;	/* how much done */  } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 626303b52f3..e916168b6e0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -32,12 +32,32 @@  #include <linux/mempolicy.h>  #include <linux/security.h>  #include <linux/ptrace.h> +#include <linux/freezer.h>  int sysctl_panic_on_oom;  int sysctl_oom_kill_allocating_task;  int sysctl_oom_dump_tasks = 1;  static DEFINE_SPINLOCK(zone_scan_lock); +/* + * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj + * @old_val: old oom_score_adj for compare + * @new_val: new oom_score_adj for swap + * + * Sets the oom_score_adj value for current to @new_val iff its present value is + * @old_val.  Usually used to reinstate a previous value to prevent racing with + * userspacing tuning the value in the interim. + */ +void compare_swap_oom_score_adj(int old_val, int new_val) +{ +	struct sighand_struct *sighand = current->sighand; + +	spin_lock_irq(&sighand->siglock); +	if (current->signal->oom_score_adj == old_val) +		current->signal->oom_score_adj = new_val; +	spin_unlock_irq(&sighand->siglock); +} +  /**   * test_set_oom_score_adj() - set current's oom_score_adj and return old value   * @new_val: new oom_score_adj value @@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val)  	spin_lock_irq(&sighand->siglock);  	old_val = current->signal->oom_score_adj; -	if (new_val != old_val) { -		if (new_val == OOM_SCORE_ADJ_MIN) -			atomic_inc(¤t->mm->oom_disable_count); -		else if (old_val == OOM_SCORE_ADJ_MIN) -			atomic_dec(¤t->mm->oom_disable_count); -		current->signal->oom_score_adj = new_val; -	} +	current->signal->oom_score_adj = new_val;  	spin_unlock_irq(&sighand->siglock);  	return old_val; @@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,  		return 0;  	/* -	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN -	 * so the entire heuristic doesn't need to be executed for something -	 * that cannot be killed. -	 */ -	if (atomic_read(&p->mm->oom_disable_count)) { -		task_unlock(p); -		return 0; -	} - -	/*  	 * The memory controller may have a limit of 0 bytes, so avoid a divide  	 * by zero, if necessary.  	 */ @@ -317,8 +321,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,  		 * blocked waiting for another task which itself is waiting  		 * for memory. Is there a better alternative?  		 
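compare_swap_oom_score_adj() above restores the previous oom_score_adj only if user space has not changed it in the meantime, i.e. a compare-and-swap carried out under the siglock (the lock is kept because the field is also read and written under it, as in test_set_oom_score_adj()). The same semantics expressed with a C11 atomic, as an analogy only:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int oom_score_adj;

/* set oom_score_adj to new_val only if it still holds old_val */
static bool compare_swap_score(int old_val, int new_val)
{
        return atomic_compare_exchange_strong(&oom_score_adj, &old_val, new_val);
}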
*/ -		if (test_tsk_thread_flag(p, TIF_MEMDIE)) +		if (test_tsk_thread_flag(p, TIF_MEMDIE)) { +			if (unlikely(frozen(p))) +				thaw_process(p);  			return ERR_PTR(-1UL); +		}  		if (!p->mm)  			continue; @@ -435,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)  	task_unlock(p);  	/* -	 * Kill all processes sharing p->mm in other thread groups, if any. +	 * Kill all user processes sharing p->mm in other thread groups, if any.  	 * They don't get access to memory reserves or a higher scheduler  	 * priority, though, to avoid depletion of all memory or task  	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed @@ -445,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)  	 * signal.  	 */  	for_each_process(q) -		if (q->mm == mm && !same_thread_group(q, p)) { +		if (q->mm == mm && !same_thread_group(q, p) && +		    !(q->flags & PF_KTHREAD)) { +			if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) +				continue; +  			task_lock(q);	/* Protect ->comm from prctl() */  			pr_err("Kill process %d (%s) sharing same memory\n",  				task_pid_nr(q), q->comm); @@ -722,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,  	read_lock(&tasklist_lock);  	if (sysctl_oom_kill_allocating_task &&  	    !oom_unkillable_task(current, NULL, nodemask) && -	    current->mm && !atomic_read(¤t->mm->oom_disable_count)) { +	    current->mm) {  		/*  		 * oom_kill_process() needs tasklist_lock held.  If it returns  		 * non-zero, current could not be killed so we must fallback to diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 650846b6158..9c31199dae9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -251,7 +251,9 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,  }  /* - * + * bdi_min_ratio keeps the sum of the minimum dirty shares of all + * registered backing devices, which, for obvious reasons, can not + * exceed 100%.   */  static unsigned int bdi_min_ratio; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e8ecb6e021..9dd443d89d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -318,6 +318,7 @@ static void bad_page(struct page *page)  		current->comm, page_to_pfn(page));  	dump_page(page); +	print_modules();  	dump_stack();  out:  	/* Leave bad fields for debug, except PageBuddy could make trouble */ @@ -1753,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,  void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)  { -	va_list args;  	unsigned int filter = SHOW_MEM_FILTER_NODES;  	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) @@ -1772,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)  		
filter &= ~SHOW_MEM_FILTER_NODES;  	if (fmt) { -		printk(KERN_WARNING); +		struct va_format vaf; +		va_list args; +  		va_start(args, fmt); -		vprintk(fmt, args); + +		vaf.fmt = fmt; +		vaf.va = &args; + +		pr_warn("%pV", &vaf); +  		va_end(args);  	} -	pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", -		   current->comm, order, gfp_mask); +	pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", +		current->comm, order, gfp_mask);  	dump_stack();  	if (!should_suppress_show_mem()) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 39d216d535e..2d123f94a8d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)  static void *__meminit alloc_page_cgroup(size_t size, int nid)  {  	void *addr = NULL; +	gfp_t flags = GFP_KERNEL | __GFP_NOWARN; -	addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); -	if (addr) +	addr = alloc_pages_exact_nid(nid, size, flags); +	if (addr) { +		kmemleak_alloc(addr, size, 1, flags);  		return addr; +	}  	if (node_state(nid, N_HIGH_MEMORY))  		addr = vmalloc_node(size, nid); @@ -357,7 +360,7 @@ struct swap_cgroup_ctrl {  	spinlock_t	lock;  }; -struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];  struct swap_cgroup {  	unsigned short		id; @@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)  	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);  	array_size = length * sizeof(void *); -	array = vmalloc(array_size); +	array = vzalloc(array_size);  	if (!array)  		goto nomem; -	memset(array, 0, array_size);  	ctrl = &swap_cgroup_ctrl[type];  	mutex_lock(&swap_cgroup_mutex);  	ctrl->length = length; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 00000000000..e920aa3ce10 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,496 @@ +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/syscalls.h> + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> +#endif + +/** + * process_vm_rw_pages - read/write pages from task specified + * @task: task to read/write from + * @mm: mm for task + * @process_pages: struct pages area that can store at least + *  nr_pages_to_copy struct page pointers + * @pa: address of page in task to start copying from/to + * @start_offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @lvec: iovec array specifying where to copy to/from + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @vm_write: 0 means copy from, 1 means copy to + * @nr_pages_to_copy: number of pages to copy + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct task_struct *task, +			       struct mm_struct *mm, +			       struct page **process_pages, +			       unsigned long pa, +			       unsigned long start_offset, +			       unsigned long len, +			       const struct iovec *lvec, +			       unsigned long lvec_cnt, +			       unsigned long *lvec_current, +			       size_t *lvec_offset, +			       int vm_write, +			       unsigned int nr_pages_to_copy, +			       ssize_t *bytes_copied) +{ +	int pages_pinned; +	void *target_kaddr; +	int pgs_copied = 0; +	int j; +	int ret; +	ssize_t bytes_to_copy; +	ssize_t rc = 0; + +	*bytes_copied = 0; + +	/* Get the pages we're interested in */ +	down_read(&mm->mmap_sem); +	pages_pinned = get_user_pages(task, mm, pa, +				      nr_pages_to_copy, +				      vm_write, 0, process_pages, NULL); +	up_read(&mm->mmap_sem); + +	if (pages_pinned != nr_pages_to_copy) { +		rc = -EFAULT; +		goto end; +	} + +	/* Do the copy for each page */ +	for (pgs_copied = 0; +	     (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); +	     pgs_copied++) { +		/* Make sure we have a non zero length iovec */ +		while (*lvec_current < lvec_cnt +		       && lvec[*lvec_current].iov_len == 0) +			(*lvec_current)++; +		if (*lvec_current == lvec_cnt) +			break; + +		/* +		 * Will copy smallest of: +		 * - bytes remaining in page +		 * - bytes remaining in destination iovec +		 */ +		bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, +				      len - *bytes_copied); +		bytes_to_copy = min_t(ssize_t, bytes_to_copy, +				      lvec[*lvec_current].iov_len +				      - *lvec_offset); + +		target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; + +		if (vm_write) +			ret = copy_from_user(target_kaddr, +					     lvec[*lvec_current].iov_base +					     + *lvec_offset, +					     bytes_to_copy); +		else +			ret = copy_to_user(lvec[*lvec_current].iov_base +					   + *lvec_offset, +					   target_kaddr, bytes_to_copy); +		kunmap(process_pages[pgs_copied]); +		if (ret) { +			*bytes_copied += bytes_to_copy - ret; +			pgs_copied++; +			rc = -EFAULT; +			goto end; +		} +		*bytes_copied += bytes_to_copy; +		*lvec_offset += bytes_to_copy; +		if (*lvec_offset == lvec[*lvec_current].iov_len) { +			/* +			 * Need to copy remaining part of page into the +			 * next iovec if there are any bytes left in page +			 */ +			(*lvec_current)++; +			*lvec_offset = 0; +			start_offset = (start_offset + bytes_to_copy) +				% PAGE_SIZE; +			if (start_offset) 
+				pgs_copied--; +		} else { +			start_offset = 0; +		} +	} + +end: +	if (vm_write) { +		for (j = 0; j < pages_pinned; j++) { +			if (j < pgs_copied) +				set_page_dirty_lock(process_pages[j]); +			put_page(process_pages[j]); +		} +	} else { +		for (j = 0; j < pages_pinned; j++) +			put_page(process_pages[j]); +	} + +	return rc; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @lvec: iovec array specifying where to copy to/from locally + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @process_pages: struct pages area that can store at least + *  nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success or on failure error code + */ +static int process_vm_rw_single_vec(unsigned long addr, +				    unsigned long len, +				    const struct iovec *lvec, +				    unsigned long lvec_cnt, +				    unsigned long *lvec_current, +				    size_t *lvec_offset, +				    struct page **process_pages, +				    struct mm_struct *mm, +				    struct task_struct *task, +				    int vm_write, +				    ssize_t *bytes_copied) +{ +	unsigned long pa = addr & PAGE_MASK; +	unsigned long start_offset = addr - pa; +	unsigned long nr_pages; +	ssize_t bytes_copied_loop; +	ssize_t rc = 0; +	unsigned long nr_pages_copied = 0; +	unsigned long nr_pages_to_copy; +	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES +		/ sizeof(struct pages *); + +	*bytes_copied = 0; + +	/* Work out address and page range required */ +	if (len == 0) +		return 0; +	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + +	while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { +		nr_pages_to_copy = min(nr_pages - nr_pages_copied, +				       max_pages_per_loop); + +		rc = process_vm_rw_pages(task, mm, process_pages, pa, +					 start_offset, len, +					 lvec, lvec_cnt, +					 lvec_current, lvec_offset, +					 vm_write, nr_pages_to_copy, +					 &bytes_copied_loop); +		start_offset = 0; +		*bytes_copied += bytes_copied_loop; + +		if (rc < 0) { +			return rc; +		} else { +			len -= bytes_copied_loop; +			nr_pages_copied += nr_pages_to_copy; +			pa += nr_pages_to_copy * PAGE_SIZE; +		} +	} + +	return rc; +} + +/* Maximum number of entries for process pages array +   which lives on stack */ +#define PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + *  return less bytes than expected if an error occurs during the copying + *  process. 
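
The nr_pages arithmetic in process_vm_rw_single_vec() above counts every page the remote range touches, including partially covered first and last pages. A standalone check of the formula, assuming 4 KiB pages (illustration only, not part of the patch):

        /* Standalone check of the nr_pages formula used above. */
        #include <stdio.h>

        #define PAGE_SIZE 4096UL

        int main(void)
        {
                unsigned long addr = 0x1003;    /* starts 3 bytes into page 1 */
                unsigned long len  = 0x2000;    /* 8 KiB, spills into page 3 */
                unsigned long nr_pages = (addr + len - 1) / PAGE_SIZE
                                         - addr / PAGE_SIZE + 1;

                printf("nr_pages = %lu\n", nr_pages);   /* prints 3 */
                return 0;
        }

A range of 0x2000 bytes starting 3 bytes into a page therefore pins three pages, not two.
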
+ */ +static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, +				  unsigned long liovcnt, +				  const struct iovec *rvec, +				  unsigned long riovcnt, +				  unsigned long flags, int vm_write) +{ +	struct task_struct *task; +	struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; +	struct page **process_pages = pp_stack; +	struct mm_struct *mm; +	unsigned long i; +	ssize_t rc = 0; +	ssize_t bytes_copied_loop; +	ssize_t bytes_copied = 0; +	unsigned long nr_pages = 0; +	unsigned long nr_pages_iov; +	unsigned long iov_l_curr_idx = 0; +	size_t iov_l_curr_offset = 0; +	ssize_t iov_len; + +	/* +	 * Work out how many pages of struct pages we're going to need +	 * when eventually calling get_user_pages +	 */ +	for (i = 0; i < riovcnt; i++) { +		iov_len = rvec[i].iov_len; +		if (iov_len > 0) { +			nr_pages_iov = ((unsigned long)rvec[i].iov_base +					+ iov_len) +				/ PAGE_SIZE - (unsigned long)rvec[i].iov_base +				/ PAGE_SIZE + 1; +			nr_pages = max(nr_pages, nr_pages_iov); +		} +	} + +	if (nr_pages == 0) +		return 0; + +	if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { +		/* For reliability don't try to kmalloc more than +		   2 pages worth */ +		process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, +					      sizeof(struct pages *)*nr_pages), +					GFP_KERNEL); + +		if (!process_pages) +			return -ENOMEM; +	} + +	/* Get process information */ +	rcu_read_lock(); +	task = find_task_by_vpid(pid); +	if (task) +		get_task_struct(task); +	rcu_read_unlock(); +	if (!task) { +		rc = -ESRCH; +		goto free_proc_pages; +	} + +	task_lock(task); +	if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { +		task_unlock(task); +		rc = -EPERM; +		goto put_task_struct; +	} +	mm = task->mm; + +	if (!mm || (task->flags & PF_KTHREAD)) { +		task_unlock(task); +		rc = -EINVAL; +		goto put_task_struct; +	} + +	atomic_inc(&mm->mm_users); +	task_unlock(task); + +	for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { +		rc = process_vm_rw_single_vec( +			(unsigned long)rvec[i].iov_base, rvec[i].iov_len, +			lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, +			process_pages, mm, task, vm_write, &bytes_copied_loop); +		bytes_copied += bytes_copied_loop; +		if (rc != 0) { +			/* If we have managed to copy any data at all then +			   we return the number of bytes copied. Otherwise +			   we return the error code */ +			if (bytes_copied) +				rc = bytes_copied; +			goto put_mm; +		} +	} + +	rc = bytes_copied; +put_mm: +	mmput(mm); + +put_task_struct: +	put_task_struct(task); + +free_proc_pages: +	if (process_pages != pp_stack) +		kfree(process_pages); +	return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + *  return less bytes than expected if an error occurs during the copying + *  process. 
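
For reference, the syscalls wired up below can be exercised from userspace roughly as follows. This is a minimal sketch, not part of the patch; it assumes the glibc wrapper for process_vm_readv() in <sys/uio.h> (available since around glibc 2.15, older systems would go through syscall(2)), and the caller needs ptrace-attach permission on the target pid, matching the PTRACE_MODE_ATTACH check above.

        /* Minimal sketch: read a buffer out of another process's address space. */
        #define _GNU_SOURCE
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/types.h>
        #include <sys/uio.h>

        int main(int argc, char **argv)
        {
                char buf[256];
                struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
                struct iovec remote;
                pid_t pid;
                ssize_t n;

                if (argc != 3) {
                        fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
                        return 1;
                }
                pid = (pid_t)strtol(argv[1], NULL, 0);
                remote.iov_base = (void *)strtoul(argv[2], NULL, 16);
                remote.iov_len = sizeof(buf);

                /* flags must be 0; non-zero is rejected with -EINVAL above */
                n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
                if (n < 0) {
                        perror("process_vm_readv");
                        return 1;
                }
                printf("copied %zd bytes\n", n);
                return 0;
        }

As the comment above notes, a partial copy is reported by returning fewer bytes than requested rather than by an error code.
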
+ */ +static ssize_t process_vm_rw(pid_t pid, +			     const struct iovec __user *lvec, +			     unsigned long liovcnt, +			     const struct iovec __user *rvec, +			     unsigned long riovcnt, +			     unsigned long flags, int vm_write) +{ +	struct iovec iovstack_l[UIO_FASTIOV]; +	struct iovec iovstack_r[UIO_FASTIOV]; +	struct iovec *iov_l = iovstack_l; +	struct iovec *iov_r = iovstack_r; +	ssize_t rc; + +	if (flags != 0) +		return -EINVAL; + +	/* Check iovecs */ +	if (vm_write) +		rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, +					   iovstack_l, &iov_l, 1); +	else +		rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, +					   iovstack_l, &iov_l, 1); +	if (rc <= 0) +		goto free_iovecs; + +	rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, +				   iovstack_r, &iov_r, 0); +	if (rc <= 0) +		goto free_iovecs; + +	rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, +				vm_write); + +free_iovecs: +	if (iov_r != iovstack_r) +		kfree(iov_r); +	if (iov_l != iovstack_l) +		kfree(iov_l); + +	return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, +		unsigned long, liovcnt, const struct iovec __user *, rvec, +		unsigned long, riovcnt,	unsigned long, flags) +{ +	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, +		const struct iovec __user *, lvec, +		unsigned long, liovcnt, const struct iovec __user *, rvec, +		unsigned long, riovcnt,	unsigned long, flags) +{ +	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} + +#ifdef CONFIG_COMPAT + +asmlinkage ssize_t +compat_process_vm_rw(compat_pid_t pid, +		     const struct compat_iovec __user *lvec, +		     unsigned long liovcnt, +		     const struct compat_iovec __user *rvec, +		     unsigned long riovcnt, +		     unsigned long flags, int vm_write) +{ +	struct iovec iovstack_l[UIO_FASTIOV]; +	struct iovec iovstack_r[UIO_FASTIOV]; +	struct iovec *iov_l = iovstack_l; +	struct iovec *iov_r = iovstack_r; +	ssize_t rc = -EFAULT; + +	if (flags != 0) +		return -EINVAL; + +	if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) +		goto out; + +	if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) +		goto out; + +	if (vm_write) +		rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, +						  UIO_FASTIOV, iovstack_l, +						  &iov_l, 1); +	else +		rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, +						  UIO_FASTIOV, iovstack_l, +						  &iov_l, 1); +	if (rc <= 0) +		goto free_iovecs; +	rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, +					  UIO_FASTIOV, iovstack_r, +					  &iov_r, 0); +	if (rc <= 0) +		goto free_iovecs; + +	rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, +			   vm_write); + +free_iovecs: +	if (iov_r != iovstack_r) +		kfree(iov_r); +	if (iov_l != iovstack_l) +		kfree(iov_l); + +out: +	return rc; +} + +asmlinkage ssize_t +compat_sys_process_vm_readv(compat_pid_t pid, +			    const struct compat_iovec __user *lvec, +			    unsigned long liovcnt, +			    const struct compat_iovec __user *rvec, +			    unsigned long riovcnt, +			    unsigned long flags) +{ +	return compat_process_vm_rw(pid, lvec, liovcnt, rvec, +				    riovcnt, flags, 0); +} + +asmlinkage ssize_t +compat_sys_process_vm_writev(compat_pid_t pid, +			     const struct compat_iovec __user *lvec, +			     unsigned long liovcnt, +			     const struct compat_iovec __user *rvec, +			     unsigned long riovcnt, +			     unsigned long flags) +{ +	return 
compat_process_vm_rw(pid, lvec, liovcnt, rvec, +				    riovcnt, flags, 1); +} + +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 8005080fb9e..6541cf7fd1d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page)  /*   * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from either try_to_unmap_anon or try_to_unmap_file. + * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.   */  int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,  		     unsigned long address, enum ttu_flags flags) diff --git a/mm/shmem.c b/mm/shmem.c index 32f6763f16f..45b9acb575f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)  		user_shm_unlock(inode->i_size, user);  		info->flags &= ~VM_LOCKED;  		mapping_clear_unevictable(file->f_mapping); +		/* +		 * Ensure that a racing putback_lru_page() can see +		 * the pages of this mapping are evictable when we +		 * skip them due to !PageLRU during the scan. +		 */ +		smp_mb__after_clear_bit();  		scan_mapping_unevictable_pages(file->f_mapping);  	}  	retval = 0; @@ -1458,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)  	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);  	if (inode) {  		error = security_inode_init_security(inode, dir, -						     &dentry->d_name, NULL, +						     &dentry->d_name,  						     NULL, NULL);  		if (error) {  			if (error != -EOPNOTSUPP) { @@ -1598,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s  	if (!inode)  		return -ENOSPC; -	error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, +	error = security_inode_init_security(inode, dir, &dentry->d_name,  					     NULL, NULL);  	if (error) {  		if (error != -EOPNOTSUPP) { @@ -2497,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags  	d_instantiate(path.dentry, inode);  	inode->i_size = size; -	inode->i_nlink = 0;	/* It is unlinked */ +	clear_nlink(inode);	/* It is unlinked */  #ifndef CONFIG_MMU  	error = ramfs_nommu_expand_for_mapping(inode, size);  	if (error) diff --git a/mm/slab.c b/mm/slab.c index 6d90a091fdc..708efe88615 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1851,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit)  	unsigned char error = 0;  	int bad_count = 0; -	printk(KERN_ERR "%03x:", offset); +	printk(KERN_ERR "%03x: ", offset);  	for (i = 0; i < limit; i++) {  		if (data[offset + i] != POISON_FREE) {  			error = data[offset + i];  			bad_count++;  		} -		printk(" %02x", (unsigned char)data[offset + i]);  	} -	printk("\n"); +	print_hex_dump(KERN_CONT, "", 0, 16, 1, +			&data[offset], limit, 1);  	if (bad_count == 1) {  		error ^= POISON_FREE; @@ -3039,14 +3039,9 @@ bad:  		printk(KERN_ERR "slab: Internal list corruption detected in "  				"cache '%s'(%d), slabp %p(%d). 
Hexdump:\n",  			cachep->name, cachep->num, slabp, slabp->inuse); -		for (i = 0; -		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); -		     i++) { -			if (i % 16 == 0) -				printk("\n%03x:", i); -			printk(" %02x", ((unsigned char *)slabp)[i]); -		} -		printk("\n"); +		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, +			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), +			1);  		BUG();  	}  } @@ -4584,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = {  static int __init slab_proc_init(void)  { -	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); +	proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);  #ifdef CONFIG_DEBUG_SLAB_LEAK  	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);  #endif diff --git a/mm/slub.c b/mm/slub.c index 7c54fe83a90..7d2a996c307 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -467,34 +467,8 @@ static int disable_higher_order_debug;   */  static void print_section(char *text, u8 *addr, unsigned int length)  { -	int i, offset; -	int newline = 1; -	char ascii[17]; - -	ascii[16] = 0; - -	for (i = 0; i < length; i++) { -		if (newline) { -			printk(KERN_ERR "%8s 0x%p: ", text, addr + i); -			newline = 0; -		} -		printk(KERN_CONT " %02x", addr[i]); -		offset = i % 16; -		ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; -		if (offset == 15) { -			printk(KERN_CONT " %s\n", ascii); -			newline = 1; -		} -	} -	if (!newline) { -		i %= 16; -		while (i < 16) { -			printk(KERN_CONT "   "); -			ascii[i] = ' '; -			i++; -		} -		printk(KERN_CONT " %s\n", ascii); -	} +	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, +			length, 1);  }  static struct track *get_track(struct kmem_cache *s, void *object, @@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)  			p, p - addr, get_freepointer(s, p));  	if (p > addr + 16) -		print_section("Bytes b4", p - 16, 16); - -	print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); +		print_section("Bytes b4 ", p - 16, 16); +	print_section("Object ", p, min_t(unsigned long, s->objsize, +				PAGE_SIZE));  	if (s->flags & SLAB_RED_ZONE) -		print_section("Redzone", p + s->objsize, +		print_section("Redzone ", p + s->objsize,  			s->inuse - s->objsize);  	if (s->offset) @@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)  	if (off != s->size)  		/* Beginning of the filler is the free pointer */ -		print_section("Padding", p + off, s->size - off); +		print_section("Padding ", p + off, s->size - off);  	dump_stack();  } @@ -681,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)  		memset(p + s->objsize, val, s->inuse - s->objsize);  } -static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) -{ -	while (bytes) { -		if (*start != value) -			return start; -		start++; -		bytes--; -	} -	return NULL; -} - -static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) -{ -	u64 value64; -	unsigned int words, prefix; - -	if (bytes <= 16) -		return check_bytes8(start, value, bytes); - -	value64 = value | value << 8 | value << 16 | value << 24; -	value64 = (value64 & 0xffffffff) | value64 << 32; -	prefix = 8 - ((unsigned long)start) % 8; - -	if (prefix) { -		u8 *r = check_bytes8(start, value, prefix); -		if (r) -			return r; -		start += prefix; -		bytes -= prefix; -	} - -	words = bytes / 8; - -	while (words) { -		if (*(u64 *)start != value64) -			return check_bytes8(start, value, 8); -		start += 
8; -		words--; -	} - -	return check_bytes8(start, value, bytes % 8); -} -  static void restore_bytes(struct kmem_cache *s, char *message, u8 data,  						void *from, void *to)  { @@ -738,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,  	u8 *fault;  	u8 *end; -	fault = check_bytes(start, value, bytes); +	fault = memchr_inv(start, value, bytes);  	if (!fault)  		return 1; @@ -831,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)  	if (!remainder)  		return 1; -	fault = check_bytes(end - remainder, POISON_INUSE, remainder); +	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);  	if (!fault)  		return 1;  	while (end > fault && end[-1] == POISON_INUSE)  		end--;  	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); -	print_section("Padding", end - remainder, remainder); +	print_section("Padding ", end - remainder, remainder);  	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);  	return 0; @@ -987,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,  			page->freelist);  		if (!alloc) -			print_section("Object", (void *)object, s->objsize); +			print_section("Object ", (void *)object, s->objsize);  		dump_stack();  	} @@ -1447,7 +1378,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)  	set_freepointer(s, last, NULL);  	page->freelist = start; -	page->inuse = 0; +	page->inuse = page->objects;  	page->frozen = 1;  out:  	return page; @@ -1534,7 +1465,7 @@ static inline void add_partial(struct kmem_cache_node *n,  				struct page *page, int tail)  {  	n->nr_partial++; -	if (tail) +	if (tail == DEACTIVATE_TO_TAIL)  		list_add_tail(&page->lru, &n->partial);  	else  		list_add(&page->lru, &n->partial); @@ -1554,10 +1485,13 @@ static inline void remove_partial(struct kmem_cache_node *n,   * Lock slab, remove from the partial list and put the object into the   * per cpu freelist.   * + * Returns a list of objects or NULL if it fails. + *   * Must hold list_lock.   */ -static inline int acquire_slab(struct kmem_cache *s, -		struct kmem_cache_node *n, struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, +		struct kmem_cache_node *n, struct page *page, +		int mode)  {  	void *freelist;  	unsigned long counters; @@ -1572,7 +1506,8 @@ static inline int acquire_slab(struct kmem_cache *s,  		freelist = page->freelist;  		counters = page->counters;  		new.counters = counters; -		new.inuse = page->objects; +		if (mode) +			new.inuse = page->objects;  		VM_BUG_ON(new.frozen);  		new.frozen = 1; @@ -1583,32 +1518,19 @@ static inline int acquire_slab(struct kmem_cache *s,  			"lock and freeze"));  	remove_partial(n, page); - -	if (freelist) { -		/* Populate the per cpu freelist */ -		this_cpu_write(s->cpu_slab->freelist, freelist); -		this_cpu_write(s->cpu_slab->page, page); -		this_cpu_write(s->cpu_slab->node, page_to_nid(page)); -		return 1; -	} else { -		/* -		 * Slab page came from the wrong list. No object to allocate -		 * from. Put it onto the correct list and continue partial -		 * scan. -		 */ -		printk(KERN_ERR "SLUB: %s : Page without available objects on" -			" partial list\n", s->name); -		return 0; -	} +	return freelist;  } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +  /*   * Try to allocate a partial slab from a specific node.   
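
The private check_bytes()/check_bytes8() helpers removed above are replaced by memchr_inv(), which returns a pointer to the first byte that does not match the given value, or NULL when the whole region matches. A userspace sketch of those semantics (an illustration only, not the kernel's word-at-a-time implementation):

        #include <assert.h>
        #include <stddef.h>
        #include <string.h>

        /* First byte in [start, start + bytes) that differs from c, or NULL. */
        static void *memchr_inv_sketch(const void *start, int c, size_t bytes)
        {
                const unsigned char *p = start;

                while (bytes--) {
                        if (*p != (unsigned char)c)
                                return (void *)p;
                        p++;
                }
                return NULL;
        }

        int main(void)
        {
                unsigned char buf[8];

                memset(buf, 0x6b, sizeof(buf));         /* POISON_FREE pattern */
                assert(memchr_inv_sketch(buf, 0x6b, sizeof(buf)) == NULL);

                buf[5] = 0x00;                          /* simulate corruption */
                assert(memchr_inv_sketch(buf, 0x6b, sizeof(buf)) == &buf[5]);
                return 0;
        }
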
*/ -static struct page *get_partial_node(struct kmem_cache *s, -					struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, +		struct kmem_cache_node *n, struct kmem_cache_cpu *c)  { -	struct page *page; +	struct page *page, *page2; +	void *object = NULL;  	/*  	 * Racy check. If we mistakenly see no partial slabs then we @@ -1620,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache *s,  		return NULL;  	spin_lock(&n->list_lock); -	list_for_each_entry(page, &n->partial, lru) -		if (acquire_slab(s, n, page)) -			goto out; -	page = NULL; -out: +	list_for_each_entry_safe(page, page2, &n->partial, lru) { +		void *t = acquire_slab(s, n, page, object == NULL); +		int available; + +		if (!t) +			break; + +		if (!object) { +			c->page = page; +			c->node = page_to_nid(page); +			stat(s, ALLOC_FROM_PARTIAL); +			object = t; +			available =  page->objects - page->inuse; +		} else { +			page->freelist = t; +			available = put_cpu_partial(s, page, 0); +		} +		if (kmem_cache_debug(s) || available > s->cpu_partial / 2) +			break; + +	}  	spin_unlock(&n->list_lock); -	return page; +	return object;  }  /*   * Get a page from somewhere. Search in increasing NUMA distances.   */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, +		struct kmem_cache_cpu *c)  {  #ifdef CONFIG_NUMA  	struct zonelist *zonelist;  	struct zoneref *z;  	struct zone *zone;  	enum zone_type high_zoneidx = gfp_zone(flags); -	struct page *page; +	void *object;  	/*  	 * The defrag ratio allows a configuration of the tradeoffs between @@ -1672,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)  		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&  				n->nr_partial > s->min_partial) { -			page = get_partial_node(s, n); -			if (page) { +			object = get_partial_node(s, n, c); +			if (object) {  				put_mems_allowed(); -				return page; +				return object;  			}  		}  	} @@ -1687,16 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)  /*   * Get a partial page, lock it and return it.   */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, +		struct kmem_cache_cpu *c)  { -	struct page *page; +	void *object;  	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; -	page = get_partial_node(s, get_node(s, searchnode)); -	if (page || node != NUMA_NO_NODE) -		return page; +	object = get_partial_node(s, get_node(s, searchnode), c); +	if (object || node != NUMA_NO_NODE) +		return object; -	return get_any_partial(s, flags); +	return get_any_partial(s, flags, c);  }  #ifdef CONFIG_PREEMPT @@ -1765,9 +1705,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s)  	for_each_possible_cpu(cpu)  		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);  } -/* - * Remove the cpu slab - */  /*   * Remove the cpu slab @@ -1781,13 +1718,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)  	enum slab_modes l = M_NONE, m = M_NONE;  	void *freelist;  	void *nextfree; -	int tail = 0; +	int tail = DEACTIVATE_TO_HEAD;  	struct page new;  	struct page old;  	if (page->freelist) {  		stat(s, DEACTIVATE_REMOTE_FREES); -		tail = 1; +		tail = DEACTIVATE_TO_TAIL;  	}  	c->tid = next_tid(c->tid); @@ -1893,7 +1830,7 @@ redo:  		if (m == M_PARTIAL) {  			add_partial(n, page, tail); -			stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); +			stat(s, tail);  		} else if (m == M_FULL) { @@ -1920,6 +1857,123 @@ redo:  	}  } +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ +	struct kmem_cache_node *n = NULL; +	struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); +	struct page *page; + +	while ((page = c->partial)) { +		enum slab_modes { M_PARTIAL, M_FREE }; +		enum slab_modes l, m; +		struct page new; +		struct page old; + +		c->partial = page->next; +		l = M_FREE; + +		do { + +			old.freelist = page->freelist; +			old.counters = page->counters; +			VM_BUG_ON(!old.frozen); + +			new.counters = old.counters; +			new.freelist = old.freelist; + +			new.frozen = 0; + +			if (!new.inuse && (!n || n->nr_partial > s->min_partial)) +				m = M_FREE; +			else { +				struct kmem_cache_node *n2 = get_node(s, +							page_to_nid(page)); + +				m = M_PARTIAL; +				if (n != n2) { +					if (n) +						spin_unlock(&n->list_lock); + +					n = n2; +					spin_lock(&n->list_lock); +				} +			} + +			if (l != m) { +				if (l == M_PARTIAL) +					remove_partial(n, page); +				else +					add_partial(n, page, 1); + +				l = m; +			} + +		} while (!cmpxchg_double_slab(s, page, +				old.freelist, old.counters, +				new.freelist, new.counters, +				"unfreezing slab")); + +		if (m == M_FREE) { +			stat(s, DEACTIVATE_EMPTY); +			discard_slab(s, page); +			stat(s, FREE_SLAB); +		} +	} + +	if (n) +		spin_unlock(&n->list_lock); +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ +	struct page *oldpage; +	int pages; +	int pobjects; + +	do { +		pages = 0; +		pobjects = 0; +		oldpage = this_cpu_read(s->cpu_slab->partial); + +		if (oldpage) { +			pobjects = oldpage->pobjects; +			pages = oldpage->pages; +			if (drain && pobjects > s->cpu_partial) { +				unsigned long flags; +				/* +				 * partial array is full. Move the existing +				 * set to the per node partial list. 
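
The per-cpu partial machinery introduced here can be observed and tuned from userspace through the SLUB sysfs attributes added further down in this patch (cpu_partial, slabs_cpu_partial) next to the existing min_partial. A minimal sketch that dumps them, assuming a SLUB kernel with sysfs and using kmalloc-64 purely as an example cache name:

        #include <stdio.h>

        static void show(const char *attr)
        {
                char path[128], line[256];
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/kernel/slab/kmalloc-64/%s", attr);
                f = fopen(path, "r");
                if (!f) {
                        perror(path);
                        return;
                }
                if (fgets(line, sizeof(line), f))
                        printf("%-20s %s", attr, line);
                fclose(f);
        }

        int main(void)
        {
                show("cpu_partial");            /* per-cpu object limit (tunable) */
                show("slabs_cpu_partial");      /* current objects(pages) per cpu */
                show("min_partial");            /* per-node partial list minimum */
                return 0;
        }

Note that the same patch tightens these attributes to mode 0600/0400, so reading them will generally require root.
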
+				 */ +				local_irq_save(flags); +				unfreeze_partials(s); +				local_irq_restore(flags); +				pobjects = 0; +				pages = 0; +			} +		} + +		pages++; +		pobjects += page->objects - page->inuse; + +		page->pages = pages; +		page->pobjects = pobjects; +		page->next = oldpage; + +	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); +	stat(s, CPU_PARTIAL_FREE); +	return pobjects; +} +  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)  {  	stat(s, CPUSLAB_FLUSH); @@ -1935,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)  {  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); -	if (likely(c && c->page)) -		flush_slab(s, c); +	if (likely(c)) { +		if (c->page) +			flush_slab(s, c); + +		unfreeze_partials(s); +	}  }  static void flush_cpu_slab(void *d) @@ -2027,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)  	}  } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, +			int node, struct kmem_cache_cpu **pc) +{ +	void *object; +	struct kmem_cache_cpu *c; +	struct page *page = new_slab(s, flags, node); + +	if (page) { +		c = __this_cpu_ptr(s->cpu_slab); +		if (c->page) +			flush_slab(s, c); + +		/* +		 * No other reference to the page yet so we can +		 * muck around with it freely without cmpxchg +		 */ +		object = page->freelist; +		page->freelist = NULL; + +		stat(s, ALLOC_SLAB); +		c->node = page_to_nid(page); +		c->page = page; +		*pc = c; +	} else +		object = NULL; + +	return object; +} +  /*   * Slow path. The lockless freelist is empty or we need to perform   * debugging duties.   * - * Interrupts are disabled. - *   * Processing is still very fast if new objects have been freed to the   * regular freelist. In that case we simply take over the regular freelist   * as the lockless freelist and zap the regular freelist. @@ -2049,7 +2134,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,  			  unsigned long addr, struct kmem_cache_cpu *c)  {  	void **object; -	struct page *page;  	unsigned long flags;  	struct page new;  	unsigned long counters; @@ -2064,13 +2148,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,  	c = this_cpu_ptr(s->cpu_slab);  #endif -	/* We handle __GFP_ZERO in the caller */ -	gfpflags &= ~__GFP_ZERO; - -	page = c->page; -	if (!page) +	if (!c->page)  		goto new_slab; - +redo:  	if (unlikely(!node_match(c, node))) {  		stat(s, ALLOC_NODE_MISMATCH);  		deactivate_slab(s, c); @@ -2080,8 +2160,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,  	stat(s, ALLOC_SLOWPATH);  	do { -		object = page->freelist; -		counters = page->counters; +		object = c->page->freelist; +		counters = c->page->counters;  		new.counters = counters;  		VM_BUG_ON(!new.frozen); @@ -2093,17 +2173,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,  		 *  		 * If there are objects left then we retrieve them  		 * and use them to refill the per cpu queue. 
-		*/ +		 */ -		new.inuse = page->objects; +		new.inuse = c->page->objects;  		new.frozen = object != NULL; -	} while (!__cmpxchg_double_slab(s, page, +	} while (!__cmpxchg_double_slab(s, c->page,  			object, counters,  			NULL, new.counters,  			"__slab_alloc")); -	if (unlikely(!object)) { +	if (!object) {  		c->page = NULL;  		stat(s, DEACTIVATE_BYPASS);  		goto new_slab; @@ -2112,58 +2192,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,  	stat(s, ALLOC_REFILL);  load_freelist: -	VM_BUG_ON(!page->frozen);  	c->freelist = get_freepointer(s, object);  	c->tid = next_tid(c->tid);  	local_irq_restore(flags);  	return object;  new_slab: -	page = get_partial(s, gfpflags, node); -	if (page) { -		stat(s, ALLOC_FROM_PARTIAL); -		object = c->freelist; -		if (kmem_cache_debug(s)) -			goto debug; -		goto load_freelist; +	if (c->partial) { +		c->page = c->partial; +		c->partial = c->page->next; +		c->node = page_to_nid(c->page); +		stat(s, CPU_PARTIAL_ALLOC); +		c->freelist = NULL; +		goto redo;  	} -	page = new_slab(s, gfpflags, node); +	/* Then do expensive stuff like retrieving pages from the partial lists */ +	object = get_partial(s, gfpflags, node, c); -	if (page) { -		c = __this_cpu_ptr(s->cpu_slab); -		if (c->page) -			flush_slab(s, c); +	if (unlikely(!object)) { -		/* -		 * No other reference to the page yet so we can -		 * muck around with it freely without cmpxchg -		 */ -		object = page->freelist; -		page->freelist = NULL; -		page->inuse = page->objects; +		object = new_slab_objects(s, gfpflags, node, &c); -		stat(s, ALLOC_SLAB); -		c->node = page_to_nid(page); -		c->page = page; +		if (unlikely(!object)) { +			if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) +				slab_out_of_memory(s, gfpflags, node); -		if (kmem_cache_debug(s)) -			goto debug; -		goto load_freelist; +			local_irq_restore(flags); +			return NULL; +		}  	} -	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) -		slab_out_of_memory(s, gfpflags, node); -	local_irq_restore(flags); -	return NULL; -debug: -	if (!object || !alloc_debug_processing(s, page, object, addr)) -		goto new_slab; +	if (likely(!kmem_cache_debug(s))) +		goto load_freelist; + +	/* Only entered in the debug case */ +	if (!alloc_debug_processing(s, c->page, object, addr)) +		goto new_slab;	/* Slab failed checks. Next slab needed */  	c->freelist = get_freepointer(s, object);  	deactivate_slab(s, c); -	c->page = NULL;  	c->node = NUMA_NO_NODE;  	local_irq_restore(flags);  	return object; @@ -2333,16 +2402,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  		was_frozen = new.frozen;  		new.inuse--;  		if ((!new.inuse || !prior) && !was_frozen && !n) { -                        n = get_node(s, page_to_nid(page)); -			/* -			 * Speculatively acquire the list_lock. -			 * If the cmpxchg does not succeed then we may -			 * drop the list_lock without any processing. -			 * -			 * Otherwise the list_lock will synchronize with -			 * other processors updating the list of slabs. -			 */ -                        spin_lock_irqsave(&n->list_lock, flags); + +			if (!kmem_cache_debug(s) && !prior) + +				/* +				 * Slab was on no list before and will be partially empty +				 * We can defer the list move and instead freeze it. +				 */ +				new.frozen = 1; + +			else { /* Needs to be taken off a list */ + +	                        n = get_node(s, page_to_nid(page)); +				/* +				 * Speculatively acquire the list_lock. 
+				 * If the cmpxchg does not succeed then we may +				 * drop the list_lock without any processing. +				 * +				 * Otherwise the list_lock will synchronize with +				 * other processors updating the list of slabs. +				 */ +				spin_lock_irqsave(&n->list_lock, flags); + +			}  		}  		inuse = new.inuse; @@ -2352,7 +2434,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  		"__slab_free"));  	if (likely(!n)) { -                /* + +		/* +		 * If we just froze the page then put it onto the +		 * per cpu partial list. +		 */ +		if (new.frozen && !was_frozen) +			put_cpu_partial(s, page, 1); + +		/*  		 * The list lock was not taken therefore no list  		 * activity can be necessary.  		 */ @@ -2377,7 +2467,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  		 */  		if (unlikely(!prior)) {  			remove_full(s, page); -			add_partial(n, page, 1); +			add_partial(n, page, DEACTIVATE_TO_TAIL);  			stat(s, FREE_ADD_PARTIAL);  		}  	} @@ -2421,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s,  	slab_free_hook(s, x);  redo: -  	/*  	 * Determine the currently cpus per cpu slab.  	 * The cpu may change afterward. However that does not matter since @@ -2685,7 +2774,7 @@ static void early_kmem_cache_node_alloc(int node)  	n = page->freelist;  	BUG_ON(!n);  	page->freelist = get_freepointer(kmem_cache_node, n); -	page->inuse++; +	page->inuse = 1;  	page->frozen = 0;  	kmem_cache_node->node[node] = n;  #ifdef CONFIG_SLUB_DEBUG @@ -2695,7 +2784,7 @@ static void early_kmem_cache_node_alloc(int node)  	init_kmem_cache_node(n, kmem_cache_node);  	inc_slabs_node(kmem_cache_node, node, page->objects); -	add_partial(n, page, 0); +	add_partial(n, page, DEACTIVATE_TO_HEAD);  }  static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2911,7 +3000,34 @@ static int kmem_cache_open(struct kmem_cache *s,  	 * The larger the object size is, the more pages we want on the partial  	 * list to avoid pounding the page allocator excessively.  	 */ -	set_min_partial(s, ilog2(s->size)); +	set_min_partial(s, ilog2(s->size) / 2); + +	/* +	 * cpu_partial determined the maximum number of objects kept in the +	 * per cpu partial lists of a processor. +	 * +	 * Per cpu partial lists mainly contain slabs that just have one +	 * object freed. If they are used for allocation then they can be +	 * filled up again with minimal effort. The slab will never hit the +	 * per node partial lists and therefore no locking will be required. +	 * +	 * This setting also determines +	 * +	 * A) The number of objects from per cpu partial slabs dumped to the +	 *    per node list when we reach the limit. +	 * B) The number of objects in cpu partial slabs to extract from the +	 *    per node list when we run out of per cpu objects. We only fetch 50% +	 *    to keep some capacity around for frees. +	 */ +	if (s->size >= PAGE_SIZE) +		s->cpu_partial = 2; +	else if (s->size >= 1024) +		s->cpu_partial = 6; +	else if (s->size >= 256) +		s->cpu_partial = 13; +	else +		s->cpu_partial = 30; +  	s->refcount = 1;  #ifdef CONFIG_NUMA  	s->remote_node_defrag_ratio = 1000; @@ -2970,13 +3086,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,  /*   * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore.   
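
Related to the partial-list handling above: kmem_cache_shrink(), reworked further down in this file to sort partial slabs by usage and release the empty ones after dropping the list_lock, can be triggered for a single cache from userspace through SLUB's existing shrink attribute. A minimal sketch (needs root; kmalloc-64 is again just an example cache name):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/sys/kernel/slab/kmalloc-64/shrink", "w");

                if (!f) {
                        perror("open shrink");
                        return 1;
                }
                if (fputs("1\n", f) == EOF)     /* "1" runs kmem_cache_shrink() */
                        perror("write shrink");
                fclose(f);
                return 0;
        }
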
*/  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)  { -	unsigned long flags;  	struct page *page, *h; -	spin_lock_irqsave(&n->list_lock, flags);  	list_for_each_entry_safe(page, h, &n->partial, lru) {  		if (!page->inuse) {  			remove_partial(n, page); @@ -2986,7 +3102,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)  				"Objects remaining on kmem_cache_close()");  		}  	} -	spin_unlock_irqrestore(&n->list_lock, flags);  }  /* @@ -3020,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s)  	s->refcount--;  	if (!s->refcount) {  		list_del(&s->list); +		up_write(&slub_lock);  		if (kmem_cache_close(s)) {  			printk(KERN_ERR "SLUB %s: %s called for cache that "  				"still has objects.\n", s->name, __func__); @@ -3028,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s)  		if (s->flags & SLAB_DESTROY_BY_RCU)  			rcu_barrier();  		sysfs_slab_remove(s); -	} -	up_write(&slub_lock); +	} else +		up_write(&slub_lock);  }  EXPORT_SYMBOL(kmem_cache_destroy); @@ -3347,23 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s)  		 * list_lock. page->inuse here is the upper limit.  		 */  		list_for_each_entry_safe(page, t, &n->partial, lru) { -			if (!page->inuse) { -				remove_partial(n, page); -				discard_slab(s, page); -			} else { -				list_move(&page->lru, -				slabs_by_inuse + page->inuse); -			} +			list_move(&page->lru, slabs_by_inuse + page->inuse); +			if (!page->inuse) +				n->nr_partial--;  		}  		/*  		 * Rebuild the partial list with the slabs filled up most  		 * first and the least used slabs at the end.  		 */ -		for (i = objects - 1; i >= 0; i--) +		for (i = objects - 1; i > 0; i--)  			list_splice(slabs_by_inuse + i, n->partial.prev);  		spin_unlock_irqrestore(&n->list_lock, flags); + +		/* Release empty slabs */ +		list_for_each_entry_safe(page, t, slabs_by_inuse, lru) +			discard_slab(s, page);  	}  	kfree(slabs_by_inuse); @@ -4319,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,  		for_each_possible_cpu(cpu) {  			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); +			struct page *page;  			if (!c || c->node < 0)  				continue; @@ -4334,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,  				total += x;  				nodes[c->node] += x;  			} +			page = c->partial; + +			if (page) { +				x = page->pobjects; +                                total += x; +                                nodes[c->node] += x; +			}  			per_cpu[c->node]++;  		}  	} @@ -4412,11 +4536,12 @@ struct slab_attribute {  };  #define SLAB_ATTR_RO(_name) \ -	static struct slab_attribute _name##_attr = __ATTR_RO(_name) +	static struct slab_attribute _name##_attr = \ +	__ATTR(_name, 0400, _name##_show, NULL)  #define SLAB_ATTR(_name) \  	static struct slab_attribute _name##_attr =  \ -	__ATTR(_name, 0644, _name##_show, _name##_store) +	__ATTR(_name, 0600, _name##_show, _name##_store)  static ssize_t slab_size_show(struct kmem_cache *s, char *buf)  { @@ -4485,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,  }  SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ +	return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, +				 size_t length) +{ +	unsigned long objects; +	int err; + +	err = strict_strtoul(buf, 10, &objects); +	if (err) +		return err; + +	s->cpu_partial = objects; +	flush_all(s); +	return length; +} +SLAB_ATTR(cpu_partial); +  static ssize_t 
ctor_show(struct kmem_cache *s, char *buf)  {  	if (!s->ctor) @@ -4523,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)  }  SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ +	int objects = 0; +	int pages = 0; +	int cpu; +	int len; + +	for_each_online_cpu(cpu) { +		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + +		if (page) { +			pages += page->pages; +			objects += page->pobjects; +		} +	} + +	len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP +	for_each_online_cpu(cpu) { +		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + +		if (page && len < PAGE_SIZE - 20) +			len += sprintf(buf + len, " C%d=%d(%d)", cpu, +				page->pobjects, page->pages); +	} +#endif +	return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); +  static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)  {  	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4845,6 +5022,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);  STAT_ATTR(ORDER_FALLBACK, order_fallback);  STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);  STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);  #endif  static struct attribute *slab_attrs[] = { @@ -4853,6 +5032,7 @@ static struct attribute *slab_attrs[] = {  	&objs_per_slab_attr.attr,  	&order_attr.attr,  	&min_partial_attr.attr, +	&cpu_partial_attr.attr,  	&objects_attr.attr,  	&objects_partial_attr.attr,  	&partial_attr.attr, @@ -4865,6 +5045,7 @@ static struct attribute *slab_attrs[] = {  	&destroy_by_rcu_attr.attr,  	&shrink_attr.attr,  	&reserved_attr.attr, +	&slabs_cpu_partial_attr.attr,  #ifdef CONFIG_SLUB_DEBUG  	&total_objects_attr.attr,  	&slabs_attr.attr, @@ -4906,6 +5087,8 @@ static struct attribute *slab_attrs[] = {  	&order_fallback_attr.attr,  	&cmpxchg_double_fail_attr.attr,  	&cmpxchg_double_cpu_fail_attr.attr, +	&cpu_partial_alloc_attr.attr, +	&cpu_partial_free_attr.attr,  #endif  #ifdef CONFIG_FAILSLAB  	&failslab_attr.attr, @@ -5257,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = {  static int __init slab_proc_init(void)  { -	proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); +	proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);  	return 0;  }  module_init(slab_proc_init); diff --git a/mm/swap.c b/mm/swap.c index 3a442f18b0b..87627f181c3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)  {  	if (unlikely(PageTail(page))) {  		/* __split_huge_page_refcount can run under us */ -		struct page *page_head = page->first_page; -		smp_rmb(); -		/* -		 * If PageTail is still set after smp_rmb() we can be sure -		 * that the page->first_page we read wasn't a dangling pointer. -		 * See __split_huge_page_refcount() smp_wmb(). -		 */ -		if (likely(PageTail(page) && get_page_unless_zero(page_head))) { +		struct page *page_head = compound_trans_head(page); + +		if (likely(page != page_head && +			   get_page_unless_zero(page_head))) {  			unsigned long flags;  			/* -			 * Verify that our page_head wasn't converted -			 * to a a regular page before we got a -			 * reference on it. +			 * page_head wasn't a dangling pointer but it +			 * may not be a head page anymore by the time +			 * we obtain the lock. That is ok as long as it +			 * can't be freed from under us.  			 
*/ -			if (unlikely(!PageHead(page_head))) { -				/* PageHead is cleared after PageTail */ -				smp_rmb(); -				VM_BUG_ON(PageTail(page)); -				goto out_put_head; -			} -			/* -			 * Only run compound_lock on a valid PageHead, -			 * after having it pinned with -			 * get_page_unless_zero() above. -			 */ -			smp_mb(); -			/* page_head wasn't a dangling pointer */  			flags = compound_lock_irqsave(page_head);  			if (unlikely(!PageTail(page))) {  				/* __split_huge_page_refcount run before us */  				compound_unlock_irqrestore(page_head, flags);  				VM_BUG_ON(PageHead(page_head)); -			out_put_head:  				if (put_page_testzero(page_head))  					__put_single_page(page_head);  			out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)  			VM_BUG_ON(page_head != page->first_page);  			/*  			 * We can release the refcount taken by -			 * get_page_unless_zero now that -			 * split_huge_page_refcount is blocked on the -			 * compound_lock. +			 * get_page_unless_zero() now that +			 * __split_huge_page_refcount() is blocked on +			 * the compound_lock.  			 */  			if (put_page_testzero(page_head))  				VM_BUG_ON(1);  			/* __split_huge_page_refcount will wait now */ -			VM_BUG_ON(atomic_read(&page->_count) <= 0); -			atomic_dec(&page->_count); +			VM_BUG_ON(page_mapcount(page) <= 0); +			atomic_dec(&page->_mapcount);  			VM_BUG_ON(atomic_read(&page_head->_count) <= 0); +			VM_BUG_ON(atomic_read(&page->_count) != 0);  			compound_unlock_irqrestore(page_head, flags);  			if (put_page_testzero(page_head)) {  				if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page)  }  EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ +	/* +	 * This takes care of get_page() if run on a tail page +	 * returned by one of the get_user_pages/follow_page variants. +	 * get_user_pages/follow_page itself doesn't need the compound +	 * lock because it runs __get_page_tail_foll() under the +	 * proper PT lock that already serializes against +	 * split_huge_page(). +	 */ +	unsigned long flags; +	bool got = false; +	struct page *page_head = compound_trans_head(page); + +	if (likely(page != page_head && get_page_unless_zero(page_head))) { +		/* +		 * page_head wasn't a dangling pointer but it +		 * may not be a head page anymore by the time +		 * we obtain the lock. That is ok as long as it +		 * can't be freed from under us. 
+		 */ +		flags = compound_lock_irqsave(page_head); +		/* here __split_huge_page_refcount won't run anymore */ +		if (likely(PageTail(page))) { +			__get_page_tail_foll(page, false); +			got = true; +		} +		compound_unlock_irqrestore(page_head, flags); +		if (unlikely(!got)) +			put_page(page_head); +	} +	return got; +} +EXPORT_SYMBOL(__get_page_tail); +  /**   * put_pages_list() - release a list of pages   * @pages: list of pages threaded on page->lru diff --git a/mm/swapfile.c b/mm/swapfile.c index 17bc224bce6..c9d65400912 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1617,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)  	oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);  	err = try_to_unuse(type); -	test_set_oom_score_adj(oom_score_adj); +	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);  	if (err) {  		/* diff --git a/mm/thrash.c b/mm/thrash.c index e53f7d02c17..57ad495dbd5 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -29,7 +29,7 @@  static DEFINE_SPINLOCK(swap_token_lock);  struct mm_struct *swap_token_mm; -struct mem_cgroup *swap_token_memcg; +static struct mem_cgroup *swap_token_memcg;  #ifdef CONFIG_CGROUP_MEM_RES_CTLR  static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5016f19e166..b669aa6f6ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1253,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);  DEFINE_RWLOCK(vmlist_lock);  struct vm_struct *vmlist; -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,  			      unsigned long flags, void *caller)  { -	struct vm_struct *tmp, **p; -  	vm->flags = flags;  	vm->addr = (void *)va->va_start;  	vm->size = va->va_end - va->va_start;  	vm->caller = caller;  	va->private = vm;  	va->flags |= VM_VM_AREA; +} +static void insert_vmalloc_vmlist(struct vm_struct *vm) +{ +	struct vm_struct *tmp, **p; + +	vm->flags &= ~VM_UNLIST;  	write_lock(&vmlist_lock);  	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {  		if (tmp->addr >= vm->addr) @@ -1275,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,  	write_unlock(&vmlist_lock);  } +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, +			      unsigned long flags, void *caller) +{ +	setup_vmalloc_vm(vm, va, flags, caller); +	insert_vmalloc_vmlist(vm); +} +  static struct vm_struct *__get_vm_area_node(unsigned long size,  		unsigned long align, unsigned long flags, unsigned long start,  		unsigned long end, int node, gfp_t gfp_mask, void *caller) @@ -1313,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,  		return NULL;  	} -	insert_vmalloc_vm(area, va, flags, caller); +	/* +	 * When this function is called from __vmalloc_node_range, +	 * we do not add vm_struct to vmlist here to avoid +	 * accessing uninitialized members of vm_struct such as +	 * pages and nr_pages fields. They will be set later. +	 * To distinguish it from others, we use a VM_UNLIST flag. 
+	 */ +	if (flags & VM_UNLIST) +		setup_vmalloc_vm(area, va, flags, caller); +	else +		insert_vmalloc_vm(area, va, flags, caller); +  	return area;  } @@ -1381,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)  	va = find_vmap_area((unsigned long)addr);  	if (va && va->flags & VM_VM_AREA) {  		struct vm_struct *vm = va->private; -		struct vm_struct *tmp, **p; -		/* -		 * remove from list and disallow access to this vm_struct -		 * before unmap. (address range confliction is maintained by -		 * vmap.) -		 */ -		write_lock(&vmlist_lock); -		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) -			; -		*p = tmp->next; -		write_unlock(&vmlist_lock); + +		if (!(vm->flags & VM_UNLIST)) { +			struct vm_struct *tmp, **p; +			/* +			 * remove from list and disallow access to +			 * this vm_struct before unmap. (address range +			 * confliction is maintained by vmap.) +			 */ +			write_lock(&vmlist_lock); +			for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) +				; +			*p = tmp->next; +			write_unlock(&vmlist_lock); +		}  		vmap_debug_free_range(va->va_start, va->va_end);  		free_unmap_vmap_area(va); @@ -1568,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,  	return area->addr;  fail: -	warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " -			  "allocated %ld of %ld bytes\n", +	warn_alloc_failed(gfp_mask, order, +			  "vmalloc: allocation failure, allocated %ld of %ld bytes\n",  			  (area->nr_pages*PAGE_SIZE), area->size);  	vfree(area->addr);  	return NULL; @@ -1600,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,  	size = PAGE_ALIGN(size);  	if (!size || (size >> PAGE_SHIFT) > totalram_pages) -		return NULL; - -	area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, -				  gfp_mask, caller); +		goto fail; +	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, +				  start, end, node, gfp_mask, caller);  	if (!area) -		return NULL; +		goto fail;  	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);  	/* +	 * In this function, newly allocated vm_struct is not added +	 * to vmlist at __get_vm_area_node(). so, it is added here. +	 */ +	insert_vmalloc_vmlist(area); + +	/*  	 * A ref_count = 3 is needed because the vm_struct and vmap_area  	 * structures allocated in the __get_vm_area_node() function contain  	 * references to the virtual address of the vmalloc'ed block. @@ -1618,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,  	kmemleak_alloc(addr, real_size, 3, gfp_mask);  	return addr; + +fail: +	warn_alloc_failed(gfp_mask, 0, +			  "vmalloc: allocation failure: %lu bytes\n", +			  real_size); +	return NULL;  }  /** diff --git a/mm/vmscan.c b/mm/vmscan.c index c735bd770d3..a1893c05079 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -495,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  			return PAGE_ACTIVATE;  		} -		/* -		 * Wait on writeback if requested to. This happens when -		 * direct reclaiming a large contiguous area and the -		 * first attempt to free a range of pages fails. -		 */ -		if (PageWriteback(page) && -		    (sc->reclaim_mode & RECLAIM_MODE_SYNC)) -			wait_on_page_writeback(page); -  		if (!PageWriteback(page)) {  			/* synchronous write or broken a_ops? 
*/  			ClearPageReclaim(page); @@ -642,13 +633,14 @@ redo:  		lru = LRU_UNEVICTABLE;  		add_page_to_unevictable_list(page);  		/* -		 * When racing with an mlock clearing (page is -		 * unlocked), make sure that if the other thread does -		 * not observe our setting of PG_lru and fails -		 * isolation, we see PG_mlocked cleared below and move +		 * When racing with an mlock or AS_UNEVICTABLE clearing +		 * (page is unlocked) make sure that if the other thread +		 * does not observe our setting of PG_lru and fails +		 * isolation/check_move_unevictable_page, +		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move  		 * the page back to the evictable list.  		 * -		 * The other side is TestClearPageMlocked(). +		 * The other side is TestClearPageMlocked() or shmem_lock().  		 */  		smp_mb();  	} @@ -759,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)   */  static unsigned long shrink_page_list(struct list_head *page_list,  				      struct zone *zone, -				      struct scan_control *sc) +				      struct scan_control *sc, +				      int priority, +				      unsigned long *ret_nr_dirty, +				      unsigned long *ret_nr_writeback)  {  	LIST_HEAD(ret_pages);  	LIST_HEAD(free_pages); @@ -767,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,  	unsigned long nr_dirty = 0;  	unsigned long nr_congested = 0;  	unsigned long nr_reclaimed = 0; +	unsigned long nr_writeback = 0;  	cond_resched(); @@ -803,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,  			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));  		if (PageWriteback(page)) { +			nr_writeback++;  			/* -			 * Synchronous reclaim is performed in two passes, -			 * first an asynchronous pass over the list to -			 * start parallel writeback, and a second synchronous -			 * pass to wait for the IO to complete.  Wait here -			 * for any page for which writeback has already -			 * started. +			 * Synchronous reclaim cannot queue pages for +			 * writeback due to the possibility of stack overflow +			 * but if it encounters a page under writeback, wait +			 * for the IO to complete.  			 */  			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&  			    may_enter_fs) @@ -865,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		if (PageDirty(page)) {  			nr_dirty++; +			/* +			 * Only kswapd can writeback filesystem pages to +			 * avoid risk of stack overflow but do not writeback +			 * unless under significant pressure. +			 */ +			if (page_is_file_cache(page) && +					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { +				/* +				 * Immediately reclaim when written back. +				 * Similar in principal to deactivate_page() +				 * except we already have the page isolated +				 * and know it's dirty +				 */ +				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); +				SetPageReclaim(page); + +				goto keep_locked; +			} +  			if (references == PAGEREF_RECLAIM_CLEAN)  				goto keep_locked;  			if (!may_enter_fs) @@ -999,6 +1013,8 @@ keep_lumpy:  	list_splice(&ret_pages, page_list);  	count_vm_events(PGACTIVATE, pgactivate); +	*ret_nr_dirty += nr_dirty; +	*ret_nr_writeback += nr_writeback;  	return nr_reclaimed;  } @@ -1012,23 +1028,27 @@ keep_lumpy:   *   * returns 0 on success, -ve errno on failure.   */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)  { +	bool all_lru_mode;  	int ret = -EINVAL;  	/* Only take pages on the LRU. 
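
The PageDirty branch added to shrink_page_list() above restricts filesystem writeback to kswapd, and even kswapd only writes back once the scan priority has dropped below DEF_PRIORITY - 2; everything else is tagged PageReclaim and counted as NR_VMSCAN_IMMEDIATE so it can be reclaimed as soon as writeback completes elsewhere. A standalone sketch of just that gate (may_writeback_file_page() is illustrative, and the surrounding may_enter_fs and page-reference checks still apply in the real path):

#include <stdbool.h>

#define DEF_PRIORITY 12		/* the kernel's initial scan priority */

static bool may_writeback_file_page(bool is_kswapd, int priority)
{
	/* direct reclaim: never; kswapd: only at priority 9 or below */
	if (!is_kswapd || priority >= DEF_PRIORITY - 2)
		return false;
	return true;
}
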
*/  	if (!PageLRU(page))  		return ret; +	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == +		(ISOLATE_ACTIVE|ISOLATE_INACTIVE); +  	/*  	 * When checking the active state, we need to be sure we are  	 * dealing with comparible boolean values.  Take the logical not  	 * of each.  	 */ -	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) +	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))  		return ret; -	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) +	if (!all_lru_mode && !!page_is_file_cache(page) != file)  		return ret;  	/* @@ -1041,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)  	ret = -EBUSY; +	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) +		return ret; + +	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) +		return ret; +  	if (likely(get_page_unless_zero(page))) {  		/*  		 * Be careful not to clear PageLRU until after we're @@ -1076,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)   */  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,  		struct list_head *src, struct list_head *dst, -		unsigned long *scanned, int order, int mode, int file) +		unsigned long *scanned, int order, isolate_mode_t mode, +		int file)  {  	unsigned long nr_taken = 0;  	unsigned long nr_lumpy_taken = 0; @@ -1201,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,  static unsigned long isolate_pages_global(unsigned long nr,  					struct list_head *dst,  					unsigned long *scanned, int order, -					int mode, struct zone *z, -					int active, int file) +					isolate_mode_t mode, +					struct zone *z,	int active, int file)  {  	int lru = LRU_BASE;  	if (active) @@ -1394,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,  }  /* - * Returns true if the caller should wait to clean dirty/writeback pages. + * Returns true if a direct reclaim should wait on pages under writeback.   *   * If we are direct reclaiming for contiguous pages and we do not reclaim   * everything in the list, try again and wait for writeback IO to complete. @@ -1416,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,  	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)  		return false; -	/* If we have relaimed everything on the isolated list, no stall */ +	/* If we have reclaimed everything on the isolated list, no stall */  	if (nr_freed == nr_taken)  		return false; @@ -1448,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  	unsigned long nr_taken;  	unsigned long nr_anon;  	unsigned long nr_file; +	unsigned long nr_dirty = 0; +	unsigned long nr_writeback = 0; +	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;  	while (unlikely(too_many_isolated(zone, file, sc))) {  		congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1458,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  	}  	set_reclaim_mode(priority, sc, false); +	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) +		reclaim_mode |= ISOLATE_ACTIVE; +  	lru_add_drain(); + +	if (!sc->may_unmap) +		reclaim_mode |= ISOLATE_UNMAPPED; +	if (!sc->may_writepage) +		reclaim_mode |= ISOLATE_CLEAN; +  	spin_lock_irq(&zone->lru_lock);  	if (scanning_global_lru(sc)) { -		nr_taken = isolate_pages_global(nr_to_scan, -			&page_list, &nr_scanned, sc->order, -			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 
-					ISOLATE_BOTH : ISOLATE_INACTIVE, -			zone, 0, file); +		nr_taken = isolate_pages_global(nr_to_scan, &page_list, +			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);  		zone->pages_scanned += nr_scanned;  		if (current_is_kswapd())  			__count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1475,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  			__count_zone_vm_events(PGSCAN_DIRECT, zone,  					       nr_scanned);  	} else { -		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, -			&page_list, &nr_scanned, sc->order, -			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? -					ISOLATE_BOTH : ISOLATE_INACTIVE, -			zone, sc->mem_cgroup, -			0, file); +		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, +			&nr_scanned, sc->order, reclaim_mode, zone, +			sc->mem_cgroup, 0, file);  		/*  		 * mem_cgroup_isolate_pages() keeps track of  		 * scanned pages on its own. @@ -1496,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  	spin_unlock_irq(&zone->lru_lock); -	nr_reclaimed = shrink_page_list(&page_list, zone, sc); +	nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, +						&nr_dirty, &nr_writeback);  	/* Check if we should syncronously wait for writeback */  	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {  		set_reclaim_mode(priority, sc, true); -		nr_reclaimed += shrink_page_list(&page_list, zone, sc); +		nr_reclaimed += shrink_page_list(&page_list, zone, sc, +					priority, &nr_dirty, &nr_writeback);  	}  	local_irq_disable(); @@ -1511,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); +	/* +	 * If reclaim is isolating dirty pages under writeback, it implies +	 * that the long-lived page allocation rate is exceeding the page +	 * laundering rate. Either the global limits are not being effective +	 * at throttling processes due to the page distribution throughout +	 * zones or there is heavy usage of a slow backing device. The +	 * only option is to throttle from reclaim context which is not ideal +	 * as there is no guarantee the dirtying process is throttled in the +	 * same way balance_dirty_pages() manages. +	 * +	 * This scales the number of dirty pages that must be under writeback +	 * before throttling depending on priority. It is a simple backoff +	 * function that has the most effect in the range DEF_PRIORITY to +	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be +	 * in trouble and reclaim is considered to be in trouble. +	 * +	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle +	 * DEF_PRIORITY-1  50% must be PageWriteback +	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble +	 * ... 
+	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any +	 *                     isolated page is PageWriteback +	 */ +	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) +		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); +  	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,  		zone_idx(zone),  		nr_scanned, nr_reclaimed, @@ -1582,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,  	struct page *page;  	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);  	unsigned long nr_rotated = 0; +	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;  	lru_add_drain(); + +	if (!sc->may_unmap) +		reclaim_mode |= ISOLATE_UNMAPPED; +	if (!sc->may_writepage) +		reclaim_mode |= ISOLATE_CLEAN; +  	spin_lock_irq(&zone->lru_lock);  	if (scanning_global_lru(sc)) {  		nr_taken = isolate_pages_global(nr_pages, &l_hold,  						&pgscanned, sc->order, -						ISOLATE_ACTIVE, zone, +						reclaim_mode, zone,  						1, file);  		zone->pages_scanned += pgscanned;  	} else {  		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,  						&pgscanned, sc->order, -						ISOLATE_ACTIVE, zone, +						reclaim_mode, zone,  						sc->mem_cgroup, 1, file);  		/*  		 * mem_cgroup_isolate_pages() keeps track of @@ -1699,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)  	if (scanning_global_lru(sc))  		low = inactive_anon_is_low_global(zone);  	else -		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); +		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);  	return low;  }  #else @@ -1742,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)  	if (scanning_global_lru(sc))  		low = inactive_file_is_low_global(zone);  	else -		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); +		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);  	return low;  } @@ -1795,12 +1863,19 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  	enum lru_list l;  	int noswap = 0;  	bool force_scan = false; -	unsigned long nr_force_scan[2]; -	/* kswapd does zone balancing and needs to scan this zone */ +	/* +	 * If the zone or memcg is small, nr[l] can be 0.  This +	 * results in no scanning on this priority and a potential +	 * priority drop.  Global direct reclaim can go to the next +	 * zone and tends to have no problems. Global kswapd is for +	 * zone balancing and it needs to scan a minimum amount. When +	 * reclaiming for a memcg, a priority drop can cause high +	 * latencies, so it's better to scan a minimum amount there as +	 * well. 
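
The wait_iff_congested() throttle added to shrink_inactive_list() above halves the required number of writeback pages for every priority step below DEF_PRIORITY, which is the backoff the comment tabulates. A small sketch of the arithmetic with simplified names (should_throttle() is not a kernel function): for 32 isolated pages the thresholds are 32 at priority 12, 16 at 11, 8 at 10, and 0 by priority 6, at which point a single PageWriteback page is enough to throttle.

#include <stdbool.h>

#define DEF_PRIORITY 12

static bool should_throttle(unsigned long nr_taken, unsigned long nr_writeback,
			    int priority)
{
	/* nr_taken >> 0 == 100%, >> 1 == 50%, >> 2 == 25%, ... */
	unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

	return nr_writeback && nr_writeback >= threshold;
}
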
+	 */  	if (scanning_global_lru(sc) && current_is_kswapd())  		force_scan = true; -	/* memcg may have small limit and need to avoid priority drop */  	if (!scanning_global_lru(sc))  		force_scan = true; @@ -1810,8 +1885,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  		fraction[0] = 0;  		fraction[1] = 1;  		denominator = 1; -		nr_force_scan[0] = 0; -		nr_force_scan[1] = SWAP_CLUSTER_MAX;  		goto out;  	} @@ -1828,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  			fraction[0] = 1;  			fraction[1] = 0;  			denominator = 1; -			nr_force_scan[0] = SWAP_CLUSTER_MAX; -			nr_force_scan[1] = 0;  			goto out;  		}  	} @@ -1878,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  	fraction[0] = ap;  	fraction[1] = fp;  	denominator = ap + fp + 1; -	if (force_scan) { -		unsigned long scan = SWAP_CLUSTER_MAX; -		nr_force_scan[0] = div64_u64(scan * ap, denominator); -		nr_force_scan[1] = div64_u64(scan * fp, denominator); -	}  out:  	for_each_evictable_lru(l) {  		int file = is_file_lru(l); @@ -1891,20 +1957,10 @@ out:  		scan = zone_nr_lru_pages(zone, sc, l);  		if (priority || noswap) {  			scan >>= priority; +			if (!scan && force_scan) +				scan = SWAP_CLUSTER_MAX;  			scan = div64_u64(scan * fraction[file], denominator);  		} - -		/* -		 * If zone is small or memcg is small, nr[l] can be 0. -		 * This results no-scan on this priority and priority drop down. -		 * For global direct reclaim, it can visit next zone and tend -		 * not to have problems. For global kswapd, it's for zone -		 * balancing and it need to scan a small amounts. When using -		 * memcg, priority drop can cause big latency. So, it's better -		 * to scan small amount. See may_noscan above. -		 */ -		if (!scan && force_scan) -			scan = nr_force_scan[file];  		nr[l] = scan;  	}  } @@ -1983,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,  	enum lru_list l;  	unsigned long nr_reclaimed, nr_scanned;  	unsigned long nr_to_reclaim = sc->nr_to_reclaim; +	struct blk_plug plug;  restart:  	nr_reclaimed = 0;  	nr_scanned = sc->nr_scanned;  	get_scan_count(zone, sc, nr, priority); +	blk_start_plug(&plug);  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||  					nr[LRU_INACTIVE_FILE]) {  		for_each_evictable_lru(l) { @@ -2012,6 +2070,7 @@ restart:  		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)  			break;  	} +	blk_finish_plug(&plug);  	sc->nr_reclaimed += nr_reclaimed;  	/* @@ -2044,14 +2103,19 @@ restart:   *   * If a zone is deemed to be full of pinned pages then just give it a light   * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is either ready to begin or deferred. + * This indicates to the caller that it should retry the allocation or fail.   
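
The get_scan_count() hunk above drops the precomputed nr_force_scan[] pair and instead applies one minimum before the anon/file split. A simplified sketch (pages_to_scan() and its parameters are illustrative, and the priority/noswap special cases are omitted): a 2000-page memcg LRU at priority 12 gives 2000 >> 12 == 0, which is bumped to SWAP_CLUSTER_MAX so the list is not skipped for a whole priority cycle.

#define SWAP_CLUSTER_MAX 32UL

static unsigned long pages_to_scan(unsigned long lru_pages, int priority,
				   int force_scan, unsigned long fraction,
				   unsigned long denominator)
{
	unsigned long scan = lru_pages >> priority;

	if (!scan && force_scan)
		scan = SWAP_CLUSTER_MAX;	/* scan a minimum amount rather than stall */

	return scan * fraction / denominator;
}
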
*/ -static void shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist,  					struct scan_control *sc)  {  	struct zoneref *z;  	struct zone *zone;  	unsigned long nr_soft_reclaimed;  	unsigned long nr_soft_scanned; +	bool should_abort_reclaim = false;  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  					gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2066,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,  				continue;  			if (zone->all_unreclaimable && priority != DEF_PRIORITY)  				continue;	/* Let kswapd poll it */ +			if (COMPACTION_BUILD) { +				/* +				 * If we already have plenty of memory free for +				 * compaction in this zone, don't free any more. +				 * Even though compaction is invoked for any +				 * non-zero order, only frequent costly order +				 * reclamation is disruptive enough to become a +				 * noticable problem, like transparent huge page +				 * allocations. +				 */ +				if (sc->order > PAGE_ALLOC_COSTLY_ORDER && +					(compaction_suitable(zone, sc->order) || +					 compaction_deferred(zone))) { +					should_abort_reclaim = true; +					continue; +				} +			}  			/*  			 * This steals pages from memory cgroups over softlimit  			 * and returns the number of reclaimed pages and @@ -2083,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,  		shrink_zone(priority, zone, sc);  	} + +	return should_abort_reclaim;  }  static bool zone_reclaimable(struct zone *zone) @@ -2147,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,  		sc->nr_scanned = 0;  		if (!priority)  			disable_swap_token(sc->mem_cgroup); -		shrink_zones(priority, zonelist, sc); +		if (shrink_zones(priority, zonelist, sc)) +			break; +  		/*  		 * Don't shrink slabs when reclaiming memory from  		 * over limit cgroups @@ -2691,6 +2776,8 @@ out:  			/* If balanced, clear the congested flag */  			zone_clear_flag(zone, ZONE_CONGESTED); +			if (i <= *classzone_idx) +				balanced += zone->present_pages;  		}  	} @@ -2764,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)  static int kswapd(void *p)  {  	unsigned long order, new_order; +	unsigned balanced_order;  	int classzone_idx, new_classzone_idx; +	int balanced_classzone_idx;  	pg_data_t *pgdat = (pg_data_t*)p;  	struct task_struct *tsk = current; @@ -2795,7 +2884,9 @@ static int kswapd(void *p)  	set_freezable();  	order = new_order = 0; +	balanced_order = 0;  	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; +	balanced_classzone_idx = classzone_idx;  	for ( ; ; ) {  		int ret; @@ -2804,7 +2895,8 @@ static int kswapd(void *p)  		 * new request of a similar or harder type will succeed soon  		 * so consider going to sleep on the basis we reclaimed at  		 */ -		if (classzone_idx >= new_classzone_idx && order == new_order) { +		if (balanced_classzone_idx >= new_classzone_idx && +					balanced_order == new_order) {  			new_order = pgdat->kswapd_max_order;  			new_classzone_idx = pgdat->classzone_idx;  			pgdat->kswapd_max_order =  0; @@ -2819,9 +2911,12 @@ static int kswapd(void *p)  			order = new_order;  			classzone_idx = new_classzone_idx;  		} else { -			kswapd_try_to_sleep(pgdat, order, classzone_idx); +			kswapd_try_to_sleep(pgdat, balanced_order, +						balanced_classzone_idx);  			order = pgdat->kswapd_max_order;  			classzone_idx = pgdat->classzone_idx; +			new_order = order; +			new_classzone_idx = classzone_idx;  			pgdat->kswapd_max_order = 0;  
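
The COMPACTION_BUILD check added to shrink_zones() above only abandons reclaim for allocations above PAGE_ALLOC_COSTLY_ORDER (order 3), for example order-9 transparent hugepage requests, and only once compaction for that zone is either ready or deferred. A stub sketch of the decision; the two booleans stand in for the results of compaction_suitable() and compaction_deferred():

#include <stdbool.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

static bool abort_reclaim_for_compaction(int order, bool compaction_ready,
					 bool compaction_deferred)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;		/* cheap orders keep reclaiming */
	return compaction_ready || compaction_deferred;
}
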
			pgdat->classzone_idx = pgdat->nr_zones - 1;  		} @@ -2836,7 +2931,9 @@ static int kswapd(void *p)  		 */  		if (!ret) {  			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); -			order = balance_pgdat(pgdat, order, &classzone_idx); +			balanced_classzone_idx = classzone_idx; +			balanced_order = balance_pgdat(pgdat, order, +						&balanced_classzone_idx);  		}  	}  	return 0; @@ -3348,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)  } -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable.  Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them.  Pages that are still unevictable are rotated - * back onto @zone's unevictable list. - */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) +static void warn_scan_unevictable_pages(void)  { -	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; -	unsigned long scan; -	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - -	while (nr_to_scan > 0) { -		unsigned long batch_size = min(nr_to_scan, -						SCAN_UNEVICTABLE_BATCH_SIZE); - -		spin_lock_irq(&zone->lru_lock); -		for (scan = 0;  scan < batch_size; scan++) { -			struct page *page = lru_to_page(l_unevictable); - -			if (!trylock_page(page)) -				continue; - -			prefetchw_prev_lru_page(page, l_unevictable, flags); - -			if (likely(PageLRU(page) && PageUnevictable(page))) -				check_move_unevictable_page(page, zone); - -			unlock_page(page); -		} -		spin_unlock_irq(&zone->lru_lock); - -		nr_to_scan -= batch_size; -	} -} - - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer:  scan all zones' unevictable LRU lists to check for - * pages that have become evictable.  Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system.  As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) -{ -	struct zone *zone; - -	for_each_zone(zone) { -		scan_zone_unevictable_pages(zone); -	} +	printk_once(KERN_WARNING +		    "The scan_unevictable_pages sysctl/node-interface has been " +		    "disabled for lack of a legitimate use case.  If you have " +		    "one, please send an email to linux-mm@kvack.org.\n");  }  /* @@ -3420,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,  			   void __user *buffer,  			   size_t *length, loff_t *ppos)  { +	warn_scan_unevictable_pages();  	proc_doulongvec_minmax(table, write, buffer, length, ppos); - -	if (write && *(unsigned long *)table->data) -		scan_all_zones_unevictable_pages(); -  	scan_unevictable_pages = 0;  	return 0;  } @@ -3439,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,  					  struct sysdev_attribute *attr,  					  char *buf)  { +	warn_scan_unevictable_pages();  	return sprintf(buf, "0\n");	/* always zero; should fit... 
*/  } @@ -3446,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,  					   struct sysdev_attribute *attr,  					const char *buf, size_t count)  { -	struct zone *node_zones = NODE_DATA(dev->id)->node_zones; -	struct zone *zone; -	unsigned long res; -	unsigned long req = strict_strtoul(buf, 10, &res); - -	if (!req) -		return 1;	/* zero is no-op */ - -	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { -		if (!populated_zone(zone)) -			continue; -		scan_zone_unevictable_pages(zone); -	} +	warn_scan_unevictable_pages();  	return 1;  } diff --git a/mm/vmstat.c b/mm/vmstat.c index d52b13d28e8..8fd603b1665 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)   *   * vm_stat contains the global counters   */ -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;  EXPORT_SYMBOL(vm_stat);  #ifdef CONFIG_SMP @@ -702,6 +702,7 @@ const char * const vmstat_text[] = {  	"nr_unstable",  	"nr_bounce",  	"nr_vmscan_write", +	"nr_vmscan_immediate_reclaim",  	"nr_writeback_temp",  	"nr_isolated_anon",  	"nr_isolated_file",