Diffstat (limited to 'mm')

 -rw-r--r--  mm/memory.c         |  13
 -rw-r--r--  mm/mempolicy.c      | 130
 -rw-r--r--  mm/page_alloc.c     |  27
 -rw-r--r--  mm/page_isolation.c |  26
 -rw-r--r--  mm/shmem.c          |   4
 -rw-r--r--  mm/vmscan.c         | 111
 6 files changed, 137 insertions(+), 174 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e0a9b0ce4f1..bb1369f7b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 		return 1;
 	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
 	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 	if (!batch)
 		return 0;
 
+	tlb->batch_count++;
 	batch->next = NULL;
 	batch->nr   = 0;
 	batch->max  = MAX_GATHER_BATCH;
@@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
@@ -3706,6 +3711,14 @@ retry:
 		if (pmd_trans_huge(orig_pmd)) {
 			unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
+			/*
+			 * If the pmd is splitting, return and retry the
+			 * the fault.  Alternative: wait until the split
+			 * is done, and goto retry.
+			 */
+			if (pmd_trans_splitting(orig_pmd))
+				return 0;
+
 			if (pmd_numa(orig_pmd))
 				return do_huge_pmd_numa_page(mm, vma, address,
 							     orig_pmd, pmd);
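The mm/memory.c hunks above cap how many extra gather batches tlb_next_batch() will allocate with GFP_NOWAIT, so a huge unmap degrades into more frequent flushes instead of consuming unbounded memory, and they bail out of the huge-pmd fault path while the pmd is being split. The following userspace toy (invented names and constants, not the kernel's mmu_gather code) sketches only the batch-cap pattern:

/*
 * Toy illustration of the pattern applied to mmu_gather: opportunistic
 * batch allocation is capped, and callers must cope with "no more
 * batches" by flushing early.  All constants here are made up.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_BATCH_SLOTS	4	/* pages per batch (stand-in for MAX_GATHER_BATCH) */
#define TOY_BATCH_COUNT	8	/* cap on chained batches (stand-in for MAX_GATHER_BATCH_COUNT) */

struct toy_batch {
	struct toy_batch *next;
	int nr;
	void *slots[TOY_BATCH_SLOTS];
};

struct toy_gather {
	struct toy_batch local;		/* always available, like tlb->__pages */
	struct toy_batch *active;
	int batch_count;
};

static void toy_gather_init(struct toy_gather *g)
{
	g->local.next = NULL;
	g->local.nr = 0;
	g->active = &g->local;
	g->batch_count = 0;
}

/* Returns 1 if another slot is available, 0 if the caller must flush. */
static int toy_next_batch(struct toy_gather *g)
{
	struct toy_batch *batch;

	if (g->active->next) {
		g->active = g->active->next;
		return 1;
	}

	/* The cap added by the patch: stop growing, force a flush instead. */
	if (g->batch_count == TOY_BATCH_COUNT)
		return 0;

	batch = calloc(1, sizeof(*batch));
	if (!batch)
		return 0;

	g->batch_count++;
	g->active->next = batch;
	g->active = batch;
	return 1;
}

static int toy_remove_page(struct toy_gather *g, void *page)
{
	struct toy_batch *b = g->active;

	b->slots[b->nr++] = page;
	if (b->nr == TOY_BATCH_SLOTS)
		return toy_next_batch(g);	/* 0 == "flush me now" */
	return 1;
}

int main(void)
{
	struct toy_gather g;
	int i, full = 0;

	toy_gather_init(&g);
	for (i = 0; i < 1000 && !full; i++)
		full = !toy_remove_page(&g, (void *)(long)i);
	printf("stopped after %d pages, %d extra batches\n", i, g.batch_count);
	return 0;
}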
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d1b315e9862..e2df1c1fb41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->mutex */
+/* Caller holds sp->lock */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 	if (!sp->root.rb_node)
 		return NULL;
-	mutex_lock(&sp->mutex);
+	spin_lock(&sp->lock);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
 		mpol_get(sn->policy);
 		pol = sn->policy;
 	}
-	mutex_unlock(&sp->mutex);
+	spin_unlock(&sp->lock);
 	return pol;
 }
 
@@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 	sp_free(n);
 }
 
+static void sp_node_init(struct sp_node *node, unsigned long start,
+			unsigned long end, struct mempolicy *pol)
+{
+	node->start = start;
+	node->end = end;
+	node->policy = pol;
+}
+
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 				struct mempolicy *pol)
 {
@@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 		return NULL;
 	}
 	newpol->flags |= MPOL_F_SHARED;
-
-	n->start = start;
-	n->end = end;
-	n->policy = newpol;
+	sp_node_init(n, start, end, newpol);
 
 	return n;
 }
@@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 				 unsigned long end, struct sp_node *new)
 {
 	struct sp_node *n;
+	struct sp_node *n_new = NULL;
+	struct mempolicy *mpol_new = NULL;
 	int ret = 0;
 
-	mutex_lock(&sp->mutex);
+restart:
+	spin_lock(&sp->lock);
 	n = sp_lookup(sp, start, end);
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
@@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 		} else {
 			/* Old policy spanning whole new range. */
 			if (n->end > end) {
-				struct sp_node *new2;
-				new2 = sp_alloc(end, n->end, n->policy);
-				if (!new2) {
-					ret = -ENOMEM;
-					goto out;
-				}
+				if (!n_new)
+					goto alloc_new;
+
+				*mpol_new = *n->policy;
+				atomic_set(&mpol_new->refcnt, 1);
+				sp_node_init(n_new, n->end, end, mpol_new);
+				sp_insert(sp, n_new);
 				n->end = start;
-				sp_insert(sp, new2);
+				n_new = NULL;
+				mpol_new = NULL;
 				break;
 			} else
 				n->end = start;
@@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 	}
 	if (new)
 		sp_insert(sp, new);
-out:
-	mutex_unlock(&sp->mutex);
+	spin_unlock(&sp->lock);
+	ret = 0;
+
+err_out:
+	if (mpol_new)
+		mpol_put(mpol_new);
+	if (n_new)
+		kmem_cache_free(sn_cache, n_new);
+
 	return ret;
+
+alloc_new:
+	spin_unlock(&sp->lock);
+	ret = -ENOMEM;
+	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	if (!n_new)
+		goto err_out;
+	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (!mpol_new)
+		goto err_out;
+	goto restart;
 }
 
 /**
@@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	mutex_init(&sp->mutex);
+	spin_lock_init(&sp->lock);
 
 	if (mpol) {
 		struct vm_area_struct pvma;
@@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
 	if (!p->root.rb_node)
 		return;
-	mutex_lock(&p->mutex);
+	spin_lock(&p->lock);
 	next = rb_first(&p->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
 		sp_delete(p, n);
 	}
-	mutex_unlock(&p->mutex);
+	spin_unlock(&p->lock);
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -2595,8 +2623,7 @@ void numa_default_policy(void)
  */
 
 /*
- * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
  */
 static const char * const policy_modes[] =
 {
@@ -2610,28 +2637,20 @@ static const char * const policy_modes[] =
 
 #ifdef CONFIG_TMPFS
 /**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
- * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy.  This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
- * mount option.  Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved.  Saving
- * it again is redundant, but safe.
- *
 * On success, returns 0, else 1
 */
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
 	unsigned short mode;
-	unsigned short uninitialized_var(mode_flags);
+	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
@@ -2719,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (IS_ERR(new))
 		goto out;
 
-	if (no_context) {
-		/* save for contextualization */
-		new->w.user_nodemask = nodes;
-	} else {
-		int ret;
-		NODEMASK_SCRATCH(scratch);
-		if (scratch) {
-			task_lock(current);
-			ret = mpol_set_nodemask(new, &nodes, scratch);
-			task_unlock(current);
-		} else
-			ret = -ENOMEM;
-		NODEMASK_SCRATCH_FREE(scratch);
-		if (ret) {
-			mpol_put(new);
-			goto out;
-		}
-	}
+	/*
+	 * Save nodes for mpol_to_str() to show the tmpfs mount options
+	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+	 */
+	if (mode != MPOL_PREFERRED)
+		new->v.nodes = nodes;
+	else if (nodelist)
+		new->v.preferred_node = first_node(nodes);
+	else
+		new->flags |= MPOL_F_LOCAL;
+
+	/*
+	 * Save nodes for contextualization: this will be used to "clone"
+	 * the mempolicy in a specific context [cpuset] at a later time.
+	 */
+	new->w.user_nodemask = nodes;
+
 	err = 0;
 
 out:
@@ -2756,13 +2774,12 @@ out:
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
- * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
 *
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
 	int l;
@@ -2788,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_PREFERRED:
 		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
-			mode = MPOL_LOCAL;	/* pseudo-policy */
+			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
@@ -2796,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
-		if (no_context)
-			nodes = pol->w.user_nodemask;
-		else
-			nodes = pol->v.nodes;
+		nodes = pol->v.nodes;
 		break;
 
 	default:
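The mm/mempolicy.c changes replace the shared_policy mutex with a spinlock, so shared_policy_replace() can no longer call a sleeping allocator while holding the lock; instead it drops the lock, preallocates the node and policy it might need, and restarts the lookup. A minimal userspace sketch of that drop-the-lock-and-retry pattern, with a pthread mutex standing in for the kernel spinlock and invented list/node names, looks like this:

/*
 * Sketch of preallocating outside the lock and restarting, as the
 * shared_policy_replace() hunk does.  It only punches a hole in a
 * covering range, mirroring the splitting step in the kernel code.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int start, end;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static int insert_range(int start, int end)
{
	struct node *spare = NULL;	/* preallocated outside the lock */

restart:
	pthread_mutex_lock(&tree_lock);
	/* Walk the list; splitting an existing range needs one extra node. */
	for (struct node *n = head; n; n = n->next) {
		if (n->start < start && end < n->end) {
			if (!spare)
				goto alloc_spare;	/* can't allocate under the lock */
			spare->start = end;
			spare->end = n->end;	/* read n->end before trimming it */
			spare->next = n->next;
			n->end = start;
			n->next = spare;
			spare = NULL;
			break;
		}
	}
	pthread_mutex_unlock(&tree_lock);
	free(spare);		/* unused preallocation, if any */
	return 0;

alloc_spare:
	pthread_mutex_unlock(&tree_lock);
	spare = malloc(sizeof(*spare));
	if (!spare)
		return -1;
	goto restart;		/* the list may have changed; redo the walk */
}

int main(void)
{
	head = malloc(sizeof(*head));
	if (!head)
		return 1;
	head->next = NULL;
	head->start = 0;
	head->end = 100;
	return insert_range(40, 60);	/* splits [0,100) into [0,40) and [60,100) */
}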
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37127f..bc6cc0e913b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-/*
- * NOTE:
- * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
- * Instead, use {un}set_pageblock_isolate.
- */
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
@@ -1655,20 +1650,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
-#ifdef CONFIG_MEMORY_ISOLATION
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
-	if (unlikely(zone->nr_pageblock_isolate))
-		return zone->nr_pageblock_isolate * pageblock_nr_pages;
-	return 0;
-}
-#else
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
-	return 0;
-}
-#endif
-
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1684,14 +1665,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
-	/*
-	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
-	 * it.  nr_zone_isolate_freepages is never accurate so kswapd might not
-	 * sleep although it could do so.  But this is more desirable for memory
-	 * hotplug than sleeping which can cause a livelock in the direct
-	 * reclaim path.
-	 */
-	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d2264ea460..383bdbb98b0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,28 +8,6 @@
 #include <linux/memory.h>
 #include "internal.h"
 
-/* called while holding zone->lock */
-static void set_pageblock_isolate(struct page *page)
-{
-	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
-		return;
-
-	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	page_zone(page)->nr_pageblock_isolate++;
-}
-
-/* called while holding zone->lock */
-static void restore_pageblock_isolate(struct page *page, int migratetype)
-{
-	struct zone *zone = page_zone(page);
-	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
-		return;
-
-	BUG_ON(zone->nr_pageblock_isolate <= 0);
-	set_pageblock_migratetype(page, migratetype);
-	zone->nr_pageblock_isolate--;
-}
-
 int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
 {
 	struct zone *zone;
@@ -80,7 +58,7 @@ out:
 		unsigned long nr_pages;
 		int migratetype = get_pageblock_migratetype(page);
 
-		set_pageblock_isolate(page);
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
 
 		__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 		goto out;
 	nr_pages = move_freepages_block(zone, page, migratetype);
 	__mod_zone_freepage_state(zone, nr_pages, migratetype);
-	restore_pageblock_isolate(page, migratetype);
+	set_pageblock_migratetype(page, migratetype);
out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 5c90d84c2b0..5dd56f6efdb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 	if (!mpol || mpol->mode == MPOL_DEFAULT)
 		return;		/* show nothing */
 
-	mpol_to_str(buffer, sizeof(buffer), mpol, 1);
+	mpol_to_str(buffer, sizeof(buffer), mpol);
 
 	seq_printf(seq, ",mpol=%s", buffer);
 }
@@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			if (!gid_valid(sbinfo->gid))
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
-			if (mpol_parse_str(value, &sbinfo->mpol, 1))
+			if (mpol_parse_str(value, &sbinfo->mpol))
 				goto bad_val;
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
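With the no_context parameter gone, tmpfs passes its mpol= mount option string straight to mpol_parse_str(), which expects the <mode>[=<flags>][:<nodelist>] syntax documented in the kernel-doc above (for example mpol=bind:0-3). A standalone toy splitter for that syntax, not the kernel parser, might look like:

/*
 * Toy splitter for "<mode>[=<flags>][:<nodelist>]" (e.g. "bind=static:0-3").
 * It only separates the three parts; node-range parsing is left out.
 */
#include <stdio.h>
#include <string.h>

static int split_mpol(char *str, char **mode, char **flags, char **nodelist)
{
	/* A nodelist, if any, follows the first ':'. */
	*nodelist = strchr(str, ':');
	if (*nodelist)
		*(*nodelist)++ = '\0';

	/* Optional flags follow '=' inside the mode part. */
	*flags = strchr(str, '=');
	if (*flags)
		*(*flags)++ = '\0';

	*mode = str;
	return **mode ? 0 : 1;		/* mirror the 0-on-success, 1-on-error convention */
}

int main(void)
{
	char buf[] = "bind=static:0-3";
	char *mode, *flags, *nodelist;

	if (!split_mpol(buf, &mode, &flags, &nodelist))
		printf("mode=%s flags=%s nodelist=%s\n",
		       mode, flags ? flags : "-", nodelist ? nodelist : "-");
	return 0;
}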
diff --git a/mm/vmscan.c b/mm/vmscan.c
index adc7e905818..196709f5ee5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2452,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order,
 }
 
 /*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the callers classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
@@ -2467,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order,
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
-						int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	unsigned long present_pages = 0;
+	unsigned long balanced_pages = 0;
 	int i;
 
-	for (i = 0; i <= classzone_idx; i++)
-		present_pages += pgdat->node_zones[i].present_pages;
+	/* Check the watermark levels */
+	for (i = 0; i <= classzone_idx; i++) {
+		struct zone *zone = pgdat->node_zones + i;
 
-	/* A special case here: if zone has no page, we think it's balanced */
-	return balanced_pages >= (present_pages >> 2);
+		if (!populated_zone(zone))
+			continue;
+
+		present_pages += zone->present_pages;
+
+		/*
+		 * A special case here:
+		 *
+		 * balance_pgdat() skips over all_unreclaimable after
+		 * DEF_PRIORITY. Effectively, it considers them balanced so
+		 * they must be considered balanced here as well!
+		 */
+		if (zone->all_unreclaimable) {
+			balanced_pages += zone->present_pages;
+			continue;
+		}
+
+		if (zone_balanced(zone, order, 0, i))
+			balanced_pages += zone->present_pages;
+		else if (!order)
+			return false;
+	}
+
+	if (order)
+		return balanced_pages >= (present_pages >> 2);
+	else
+		return true;
 }
 
 /*
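Restated outside the kernel, the reworked pgdat_balanced() rule is: for order-0 every populated, reclaimable zone must pass its watermark, while for higher orders it is enough that zones holding at least 25% of the allowed pages do. A self-contained sketch over a made-up zone array (toy types, not the kernel's struct zone), with a usage that mirrors the 16M-DMA example from the comment above: a small unbalanced zone cannot veto a high-order-balanced node, but it does veto order-0.

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long present_pages;
	bool populated;
	bool all_unreclaimable;
	bool watermark_ok;	/* stand-in for zone_balanced() */
};

static bool toy_pgdat_balanced(const struct toy_zone *zones, int nr_zones,
			       int order)
{
	unsigned long present_pages = 0;
	unsigned long balanced_pages = 0;

	for (int i = 0; i < nr_zones; i++) {
		const struct toy_zone *z = &zones[i];

		if (!z->populated)
			continue;

		present_pages += z->present_pages;

		/* Unreclaimable zones count as balanced, as in the patch. */
		if (z->all_unreclaimable || z->watermark_ok)
			balanced_pages += z->present_pages;
		else if (!order)
			return false;	/* order-0: one bad zone fails the node */
	}

	/* High-order: 25% of the allowed pages must sit in balanced zones. */
	if (order)
		return balanced_pages >= (present_pages >> 2);
	return true;
}

int main(void)
{
	struct toy_zone node[] = {
		{ .present_pages = 4096,   .populated = true, .watermark_ok = false },
		{ .present_pages = 262144, .populated = true, .watermark_ok = true },
	};

	printf("order-0 balanced: %d\n", toy_pgdat_balanced(node, 2, 0));	/* 0 */
	printf("order-3 balanced: %d\n", toy_pgdat_balanced(node, 2, 3));	/* 1 */
	return 0;
}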
@@ -2489,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
-	int i;
-	unsigned long balanced = 0;
-	bool all_zones_ok = true;
-
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
 		return false;
@@ -2511,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 		return false;
 	}
 
-	/* Check the watermark levels */
-	for (i = 0; i <= classzone_idx; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		if (!populated_zone(zone))
-			continue;
-
-		/*
-		 * balance_pgdat() skips over all_unreclaimable after
-		 * DEF_PRIORITY. Effectively, it considers them balanced so
-		 * they must be considered balanced here as well if kswapd
-		 * is to sleep
-		 */
-		if (zone->all_unreclaimable) {
-			balanced += zone->present_pages;
-			continue;
-		}
-
-		if (!zone_balanced(zone, order, 0, i))
-			all_zones_ok = false;
-		else
-			balanced += zone->present_pages;
-	}
-
-	/*
-	 * For high-order requests, the balanced zones must contain at least
-	 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
-	 * must be balanced
-	 */
-	if (order)
-		return pgdat_balanced(pgdat, balanced, classzone_idx);
-	else
-		return all_zones_ok;
+	return pgdat_balanced(pgdat, order, classzone_idx);
 }
 
 /*
@@ -2571,7 +2565,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
 	struct zone *unbalanced_zone;
-	unsigned long balanced;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2605,7 +2598,6 @@ loop_again:
 		int has_under_min_watermark_zone = 0;
 
 		unbalanced_zone = NULL;
-		balanced = 0;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2761,8 +2753,6 @@ loop_again:
 				 * speculatively avoid congestion waits
 				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
-				if (i <= *classzone_idx)
-					balanced += zone->present_pages;
 			}
 		}
 
@@ -2776,7 +2766,7 @@ loop_again:
 				pfmemalloc_watermark_ok(pgdat))
 			wake_up(&pgdat->pfmemalloc_wait);
 
-		if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+		if (pgdat_balanced(pgdat, order, *classzone_idx))
 			break;		/* kswapd: all done */
 
 		/*
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
@@ -2785,7 +2775,7 @@ loop_again:
 		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
 			if (has_under_min_watermark_zone)
 				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-			else
+			else if (unbalanced_zone)
 				wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
 		}
 
@@ -2800,12 +2790,7 @@ loop_again:
 	} while (--sc.priority >= 0);
 
 out:
-	/*
-	 * order-0: All zones must meet high watermark for a balanced node
-	 * high-order: Balanced zones must make up at least 25% of the node
-	 *             for the node to be balanced
-	 */
-	if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
+	if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
 		cond_resched();
 		try_to_freeze();
 
@@ -3137,8 +3122,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
 {
 	int nid;
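Taken together, the vmscan.c hunks make a single pgdat_balanced() check drive both kswapd's early exit and its decision to nap, and the congestion wait now only targets a zone that was actually recorded as unbalanced. A self-contained control-flow sketch with stub predicates (all names invented, not the kernel loop):

#include <stdbool.h>
#include <stdio.h>

/* Trivial stand-ins so the sketch runs; the real predicates live in vmscan.c. */
static int pressure = 7;
static bool reclaim_pass(void)  { pressure -= 2; return pressure > 0; }	/* true: a zone is still unbalanced */
static bool node_balanced(void) { return pressure <= 0; }		/* stand-in for pgdat_balanced() */

static void toy_kswapd_balance(int max_passes)
{
	bool have_unbalanced_zone;

	for (int pass = 0; pass < max_passes; pass++) {
		have_unbalanced_zone = reclaim_pass();

		if (node_balanced())
			break;			/* kswapd: all done */

		/*
		 * Getting into trouble: nap briefly, but only on a zone that
		 * was actually recorded as unbalanced (the unbalanced_zone
		 * check added before wait_iff_congested()).
		 */
		if (pass >= 2 && have_unbalanced_zone)
			printf("pass %d: napping on the unbalanced zone\n", pass);
	}
}

int main(void)
{
	toy_kswapd_balance(6);
	return 0;
}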