Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	222
1 file changed, 158 insertions, 64 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3a..e013b8e57d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET	(1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
-	unsigned long   next_scan_node_update;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			     MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
-				MEM_CGROUP_TARGET_SOFTLIMIT);
+						   MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
 	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap: specify true here if the user wants file only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
-	mem->next_scan_node_update = jiffies + 10*HZ;
 
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging.  It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
}
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
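The lazy nodemask refresh added above (numainfo_events / numainfo_updating in mem_cgroup_may_update_nodemask()) is an instance of a common pattern: count events cheaply on the hot path, and let exactly one caller perform the expensive refresh once enough events have accumulated. The sketch below is a minimal user-space illustration of that pattern only; it uses C11 <stdatomic.h> and invented names (info_events, maybe_refresh) rather than the kernel's atomic_t API, and is not part of the patch.

/*
 * User-space sketch of the lazy, single-writer refresh pattern used by
 * mem_cgroup_may_update_nodemask() in this diff.  The C11 atomics stand in
 * for the kernel's atomic_read(), atomic_inc_return() and atomic_set().
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int info_events;    /* bumped on the hot path, like numainfo_events   */
static atomic_int info_updating;  /* guards the refresh, like numainfo_updating      */

static void expensive_refresh(void)
{
	/* Stand-in for rebuilding mem->scan_nodes. */
	puts("refreshing cached node mask");
}

static void maybe_refresh(void)
{
	/* Nothing happened since the last refresh: skip the work. */
	if (atomic_load(&info_events) == 0)
		return;
	/* Someone else is already refreshing: let them finish.
	 * fetch_add > 0 corresponds to atomic_inc_return() > 1 in the patch. */
	if (atomic_fetch_add(&info_updating, 1) > 0)
		return;

	expensive_refresh();

	/* Reset both counters so the next burst of events triggers a refresh. */
	atomic_store(&info_events, 0);
	atomic_store(&info_updating, 0);
}

int main(void)
{
	atomic_fetch_add(&info_events, 1);  /* hot path: record that state changed */
	maybe_refresh();                    /* slow path: refresh runs once        */
	maybe_refresh();                    /* no new events, so this is a no-op   */
	return 0;
}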
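Similarly, the reworked drain_all_stock_async() combines two guards: a trylock on percpu_charge_mutex so concurrent callers do not pile up draining work, and a per-stock FLUSHING_CACHED_CHARGE bit so each CPU's drain is queued at most once. The user-space sketch below shows the same two guards with pthreads and C11 atomics; the names are illustrative, the hierarchy check (css_is_ancestor) is omitted, and the kernel defers the actual drain to a workqueue rather than calling it inline as done here.

/*
 * Sketch of the dedup scheme drain_all_stock_async() switches to in this
 * diff: trylock to bound concurrent callers, plus a per-"cpu" flag so each
 * pending drain is scheduled only once.  Not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct stock {
	atomic_bool flushing;	/* mirrors the FLUSHING_CACHED_CHARGE bit */
	int cached;		/* pages cached for some group, 0 if none  */
};

static struct stock stocks[NR_CPUS];
static pthread_mutex_t charge_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the work item that drains one CPU's stock. */
static void drain_one(struct stock *s)
{
	s->cached = 0;
	atomic_store(&s->flushing, false);	/* like clear_bit() in drain_local_stock() */
}

static void drain_all_async(int curcpu)
{
	/* A drain is already in flight: don't queue more workers. */
	if (pthread_mutex_trylock(&charge_mutex) != 0)
		return;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct stock *s = &stocks[cpu];

		/* Skip the current cpu and cpus with nothing cached. */
		if (cpu == curcpu || !s->cached)
			continue;
		/* Queue the drain only if it is not already pending. */
		if (!atomic_exchange(&s->flushing, true))
			drain_one(s);	/* the kernel schedules a work item instead */
	}
	pthread_mutex_unlock(&charge_mutex);
}

int main(void)
{
	stocks[2].cached = 32;
	drain_all_async(0);
	printf("cpu2 cached after drain: %d\n", stocks[2].cached);
	return 0;
}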