Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c | 1242
1 file changed, 1176 insertions, 66 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bbfac5063ca..f3009b4bae5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@   * Copyright (C) 2009 Nokia Corporation   * Author: Kirill A. Shutemov   * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License as published by   * the Free Software Foundation; either version 2 of the License, or @@ -268,6 +272,10 @@ struct mem_cgroup {  	};  	/* +	 * the counter to account for kernel memory usage. +	 */ +	struct res_counter kmem; +	/*  	 * Per cgroup active and inactive list, similar to the  	 * per zone LRU lists.  	 */ @@ -282,6 +290,7 @@ struct mem_cgroup {  	 * Should the accounting and control be hierarchical, per subtree?  	 */  	bool use_hierarchy; +	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */  	bool		oom_lock;  	atomic_t	under_oom; @@ -332,8 +341,61 @@ struct mem_cgroup {  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)  	struct tcp_memcontrol tcp_mem;  #endif +#if defined(CONFIG_MEMCG_KMEM) +	/* analogous to slab_common's slab_caches list. per-memcg */ +	struct list_head memcg_slab_caches; +	/* Not a spinlock, we can take a lot of time walking the list */ +	struct mutex slab_caches_mutex; +        /* Index in the kmem_cache->memcg_params->memcg_caches array */ +	int kmemcg_id; +#endif  }; +/* internal only representation about the status of kmem accounting. */ +enum { +	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ +	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ +	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ +		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ +	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ +	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ +	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ +	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ +	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) +		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ +	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, +				  &memcg->kmem_account_flags); +} +#endif +  /* Stuffs for move charges at task migration. */  /*   * Types of charges to be moved. 
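/*
 * A stand-alone user-space model (not part of the patch) of the
 * KMEM_ACCOUNTED_* bits defined a few hunks above: the ACTIVE/ACTIVATED
 * bits are the ones covered by KMEM_ACCOUNTED_MASK and therefore the
 * ones that make kmem charging take effect, while DEAD only marks a
 * removed memcg that still has kmem charges pending. All names below
 * are illustrative, not kernel symbols.
 */
#include <stdbool.h>
#include <stdio.h>

enum { ACCOUNTED_ACTIVE = 0, ACCOUNTED_ACTIVATED, ACCOUNTED_DEAD };
#define ACCOUNTED_MASK \
	((1UL << ACCOUNTED_ACTIVE) | (1UL << ACCOUNTED_ACTIVATED))

struct fake_memcg { unsigned long kmem_account_flags; };

static bool can_account_kmem(const struct fake_memcg *m)
{
	return m->kmem_account_flags & ACCOUNTED_MASK;
}

int main(void)
{
	struct fake_memcg m = { 0 };

	printf("fresh memcg accounted? %d\n", can_account_kmem(&m));
	m.kmem_account_flags |= 1UL << ACCOUNTED_ACTIVATED;  /* static key patched */
	m.kmem_account_flags |= 1UL << ACCOUNTED_ACTIVE;     /* limit was set */
	printf("after limit set? %d\n", can_account_kmem(&m));
	m.kmem_account_flags |= 1UL << ACCOUNTED_DEAD;       /* cgroup removed */
	printf("DEAD does not affect the mask: %d\n", can_account_kmem(&m));
	return 0;
}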
"move_charge_at_immitgrate" is treated as a @@ -388,9 +450,13 @@ enum charge_type {  };  /* for encoding cft->private value on file */ -#define _MEM			(0) -#define _MEMSWAP		(1) -#define _OOM_TYPE		(2) +enum res_type { +	_MEM, +	_MEMSWAP, +	_OOM_TYPE, +	_KMEM, +}; +  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)  #define MEMFILE_ATTR(val)	((val) & 0xffff) @@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)  }  #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + *  1) this works better in sparse environments, where we have a lot of memcgs, + *     but only a few kmem-limited. Or also, if we have, for instance, 200 + *     memcgs, and none but the 200th is kmem-limited, we'd have to have a + *     200 entry array for that. + * + *  2) In order not to violate the cgroup API, we would like to do all memory + *     allocation in ->create(). At that point, we haven't yet allocated the + *     css_id. Having a separate index prevents us from messing with the cgroup + *     core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size.  It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +	if (memcg_kmem_is_active(memcg)) { +		static_key_slow_dec(&memcg_kmem_enabled_key); +		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); +	} +	/* +	 * This check can't live in kmem destruction function, +	 * since the charges will outlive the cgroup +	 */ +	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ +	disarm_sock_keys(memcg); +	disarm_kmem_keys(memcg); +} +  static void drain_all_stock_async(struct mem_cgroup *memcg);  static struct mem_cgroup_per_zone * @@ -1453,6 +1588,10 @@ done:  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); +	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", +		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, +		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, +		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));  }  /* @@ -2060,20 +2199,28 @@ struct memcg_stock_pcp {  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);  static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock.  Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise.   */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)  {  	struct memcg_stock_pcp *stock;  	bool ret = true; +	if (nr_pages > CHARGE_BATCH) +		return false; +  	stock = &get_cpu_var(memcg_stock); -	if (memcg == stock->cached && stock->nr_pages) -		stock->nr_pages--; +	if (memcg == stock->cached && stock->nr_pages >= nr_pages) +		stock->nr_pages -= nr_pages;  	else /* need to call res_counter_charge */  		ret = false;  	put_cpu_var(memcg_stock); @@ -2250,7 +2397,8 @@ enum {  };  static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, -				unsigned int nr_pages, bool oom_check) +				unsigned int nr_pages, unsigned int min_pages, +				bool oom_check)  {  	unsigned long csize = nr_pages * PAGE_SIZE;  	struct mem_cgroup *mem_over_limit; @@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	} else  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);  	/* -	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch -	 * of regular pages (CHARGE_BATCH), or a single regular page (1). -	 *  	 * Never reclaim on behalf of optional batching, retry with a  	 * single page instead.  	 
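/*
 * A user-space sketch (not part of the patch) of the reworked
 * consume_stock() semantics: a request larger than the per-cpu batch is
 * never served from the stock, and a smaller one succeeds only if the
 * stock is cached for the same memcg and holds at least nr_pages.
 * CHARGE_BATCH's value and the struct below are assumed for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U	/* assumed batch size */

struct pcp_stock { const void *cached; unsigned int nr_pages; };

static bool consume_stock(struct pcp_stock *stock, const void *memcg,
			  unsigned int nr_pages)
{
	if (nr_pages > CHARGE_BATCH)
		return false;	/* too large, caller charges the res_counter */
	if (stock->cached == memcg && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		return true;
	}
	return false;
}

int main(void)
{
	int memcg_a, memcg_b;
	struct pcp_stock stock = { &memcg_a, 20 };

	printf("%d\n", consume_stock(&stock, &memcg_a, 4));	/* 1: 16 pages left */
	printf("%d\n", consume_stock(&stock, &memcg_b, 1));	/* 0: other memcg */
	printf("%d\n", consume_stock(&stock, &memcg_a, 64));	/* 0: above batch */
	printf("remaining: %u\n", stock.nr_pages);		/* 16 */
	return 0;
}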
*/ -	if (nr_pages == CHARGE_BATCH) +	if (nr_pages > min_pages)  		return CHARGE_RETRY;  	if (!(gfp_mask & __GFP_WAIT))  		return CHARGE_WOULDBLOCK; +	if (gfp_mask & __GFP_NORETRY) +		return CHARGE_NOMEM; +  	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)  		return CHARGE_RETRY; @@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	 * unlikely to succeed so close to the limit, and we fall back  	 * to regular pages anyway in case of failure.  	 */ -	if (nr_pages == 1 && ret) +	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)  		return CHARGE_RETRY;  	/* @@ -2371,7 +2519,7 @@ again:  		memcg = *ptr;  		if (mem_cgroup_is_root(memcg))  			goto done; -		if (nr_pages == 1 && consume_stock(memcg)) +		if (consume_stock(memcg, nr_pages))  			goto done;  		css_get(&memcg->css);  	} else { @@ -2396,7 +2544,7 @@ again:  			rcu_read_unlock();  			goto done;  		} -		if (nr_pages == 1 && consume_stock(memcg)) { +		if (consume_stock(memcg, nr_pages)) {  			/*  			 * It seems dagerous to access memcg without css_get().  			 * But considering how consume_stok works, it's not @@ -2431,7 +2579,8 @@ again:  			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;  		} -		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); +		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, +		    oom_check);  		switch (ret) {  		case CHARGE_OK:  			break; @@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	memcg_check_events(memcg, page);  } +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ +	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && +		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. + */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ +	struct kmem_cache *cachep; + +	VM_BUG_ON(p->is_root_cache); +	cachep = p->root_cache; +	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, +					struct seq_file *m) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); +	struct memcg_cache_params *params; + +	if (!memcg_can_account_kmem(memcg)) +		return -EIO; + +	print_slabinfo_header(m); + +	mutex_lock(&memcg->slab_caches_mutex); +	list_for_each_entry(params, &memcg->memcg_slab_caches, list) +		cache_show(memcg_params_to_cache(params), m); +	mutex_unlock(&memcg->slab_caches_mutex); + +	return 0; +} +#endif + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ +	struct res_counter *fail_res; +	struct mem_cgroup *_memcg; +	int ret = 0; +	bool may_oom; + +	ret = res_counter_charge(&memcg->kmem, size, &fail_res); +	if (ret) +		return ret; + +	/* +	 * Conditions under which we can wait for the oom_killer. Those are +	 * the same conditions tested by the core page allocator +	 */ +	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + +	_memcg = memcg; +	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, +				      &_memcg, may_oom); + +	if (ret == -EINTR)  { +		/* +		 * __mem_cgroup_try_charge() chosed to bypass to root due to +		 * OOM kill or fatal signal.  
Since our only options are to +		 * either fail the allocation or charge it to this cgroup, do +		 * it as a temporary condition. But we can't fail. From a +		 * kmem/slab perspective, the cache has already been selected, +		 * by mem_cgroup_kmem_get_cache(), so it is too late to change +		 * our minds. +		 * +		 * This condition will only trigger if the task entered +		 * memcg_charge_kmem in a sane state, but was OOM-killed during +		 * __mem_cgroup_try_charge() above. Tasks that were already +		 * dying when the allocation triggers should have been already +		 * directed to the root cgroup in memcontrol.h +		 */ +		res_counter_charge_nofail(&memcg->res, size, &fail_res); +		if (do_swap_account) +			res_counter_charge_nofail(&memcg->memsw, size, +						  &fail_res); +		ret = 0; +	} else if (ret) +		res_counter_uncharge(&memcg->kmem, size); + +	return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ +	res_counter_uncharge(&memcg->res, size); +	if (do_swap_account) +		res_counter_uncharge(&memcg->memsw, size); + +	/* Not down to 0 */ +	if (res_counter_uncharge(&memcg->kmem, size)) +		return; + +	if (memcg_kmem_test_and_clear_dead(memcg)) +		mem_cgroup_put(memcg); +} + +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ +	if (!memcg) +		return; + +	mutex_lock(&memcg->slab_caches_mutex); +	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); +	mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ +	return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ +	int num, ret; + +	num = ida_simple_get(&kmem_limited_groups, +				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); +	if (num < 0) +		return num; +	/* +	 * After this point, kmem_accounted (that we test atomically in +	 * the beginning of this conditional), is no longer 0. This +	 * guarantees only one process will set the following boolean +	 * to true. We don't need test_and_set because we're protected +	 * by the set_limit_mutex anyway. +	 */ +	memcg_kmem_set_activated(memcg); + +	ret = memcg_update_all_caches(num+1); +	if (ret) { +		ida_simple_remove(&kmem_limited_groups, num); +		memcg_kmem_clear_activated(memcg); +		return ret; +	} + +	memcg->kmemcg_id = num; +	INIT_LIST_HEAD(&memcg->memcg_slab_caches); +	mutex_init(&memcg->slab_caches_mutex); +	return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ +	ssize_t size; +	if (num_groups <= 0) +		return 0; + +	size = 2 * num_groups; +	if (size < MEMCG_CACHES_MIN_SIZE) +		size = MEMCG_CACHES_MIN_SIZE; +	else if (size > MEMCG_CACHES_MAX_SIZE) +		size = MEMCG_CACHES_MAX_SIZE; + +	return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. 
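/*
 * A stand-alone check (not part of the patch) of memcg_caches_array_size()
 * exactly as defined above: the per-cache memcg_caches array is sized to
 * twice the number of kmem-limited groups, clamped to the
 * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE] range.
 */
#include <stdio.h>
#include <sys/types.h>

#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535

static size_t memcg_caches_array_size(int num_groups)
{
	ssize_t size;

	if (num_groups <= 0)
		return 0;

	size = 2 * num_groups;
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	return size;
}

int main(void)
{
	int samples[] = { 0, 1, 3, 100, 40000 };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%d kmem-limited groups -> %zu array slots\n",
		       samples[i], memcg_caches_array_size(samples[i]));
	return 0;	/* expected: 0, 4, 6, 200, 65535 */
}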
+ */ +void memcg_update_array_size(int num) +{ +	if (num > memcg_limited_groups_array_size) +		memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ +	struct memcg_cache_params *cur_params = s->memcg_params; + +	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + +	if (num_groups > memcg_limited_groups_array_size) { +		int i; +		ssize_t size = memcg_caches_array_size(num_groups); + +		size *= sizeof(void *); +		size += sizeof(struct memcg_cache_params); + +		s->memcg_params = kzalloc(size, GFP_KERNEL); +		if (!s->memcg_params) { +			s->memcg_params = cur_params; +			return -ENOMEM; +		} + +		s->memcg_params->is_root_cache = true; + +		/* +		 * There is the chance it will be bigger than +		 * memcg_limited_groups_array_size, if we failed an allocation +		 * in a cache, in which case all caches updated before it, will +		 * have a bigger array. +		 * +		 * But if that is the case, the data after +		 * memcg_limited_groups_array_size is certainly unused +		 */ +		for (i = 0; i < memcg_limited_groups_array_size; i++) { +			if (!cur_params->memcg_caches[i]) +				continue; +			s->memcg_params->memcg_caches[i] = +						cur_params->memcg_caches[i]; +		} + +		/* +		 * Ideally, we would wait until all caches succeed, and only +		 * then free the old one. But this is not worth the extra +		 * pointer per-cache we'd have to have for this. +		 * +		 * It is not a big deal if some caches are left with a size +		 * bigger than the others. And all updates will reset this +		 * anyway. +		 */ +		kfree(cur_params); +	} +	return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, +			 struct kmem_cache *root_cache) +{ +	size_t size = sizeof(struct memcg_cache_params); + +	if (!memcg_kmem_enabled()) +		return 0; + +	if (!memcg) +		size += memcg_limited_groups_array_size * sizeof(void *); + +	s->memcg_params = kzalloc(size, GFP_KERNEL); +	if (!s->memcg_params) +		return -ENOMEM; + +	if (memcg) { +		s->memcg_params->memcg = memcg; +		s->memcg_params->root_cache = root_cache; +	} +	return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ +	struct kmem_cache *root; +	struct mem_cgroup *memcg; +	int id; + +	/* +	 * This happens, for instance, when a root cache goes away before we +	 * add any memcg. +	 */ +	if (!s->memcg_params) +		return; + +	if (s->memcg_params->is_root_cache) +		goto out; + +	memcg = s->memcg_params->memcg; +	id  = memcg_cache_id(memcg); + +	root = s->memcg_params->root_cache; +	root->memcg_params->memcg_caches[id] = NULL; +	mem_cgroup_put(memcg); + +	mutex_lock(&memcg->slab_caches_mutex); +	list_del(&s->memcg_params->list); +	mutex_unlock(&memcg->slab_caches_mutex); + +out: +	kfree(s->memcg_params); +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. 
+ * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ +	VM_BUG_ON(!current->mm); +	current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ +	VM_BUG_ON(!current->mm); +	current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ +	struct kmem_cache *cachep; +	struct memcg_cache_params *p; + +	p = container_of(w, struct memcg_cache_params, destroy); + +	cachep = memcg_params_to_cache(p); + +	/* +	 * If we get down to 0 after shrink, we could delete right away. +	 * However, memcg_release_pages() already puts us back in the workqueue +	 * in that case. If we proceed deleting, we'll get a dangling +	 * reference, and removing the object from the workqueue in that case +	 * is unnecessary complication. We are not a fast path. +	 * +	 * Note that this case is fundamentally different from racing with +	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in +	 * kmem_cache_shrink, not only we would be reinserting a dead cache +	 * into the queue, but doing so from inside the worker racing to +	 * destroy it. +	 * +	 * So if we aren't down to zero, we'll just schedule a worker and try +	 * again +	 */ +	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { +		kmem_cache_shrink(cachep); +		if (atomic_read(&cachep->memcg_params->nr_pages) == 0) +			return; +	} else +		kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ +	if (!cachep->memcg_params->dead) +		return; + +	/* +	 * There are many ways in which we can get here. +	 * +	 * We can get to a memory-pressure situation while the delayed work is +	 * still pending to run. The vmscan shrinkers can then release all +	 * cache memory and get us to destruction. If this is the case, we'll +	 * be executed twice, which is a bug (the second time will execute over +	 * bogus data). In this case, cancelling the work should be fine. +	 * +	 * But we can also get here from the worker itself, if +	 * kmem_cache_shrink is enough to shake all the remaining objects and +	 * get the page count to 0. In this case, we'll deadlock if we try to +	 * cancel the work (the worker runs with an internal lock held, which +	 * is the same lock we would hold for cancel_work_sync().) +	 * +	 * Since we can't possibly know who got us here, just refrain from +	 * running if there is already work pending +	 */ +	if (work_pending(&cachep->memcg_params->destroy)) +		return; +	/* +	 * We have to defer the actual destroying to a workqueue, because +	 * we might currently be in a context that cannot sleep. 
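/*
 * A user-space model (not part of the patch) of the
 * memcg_stop_kmem_account()/memcg_resume_kmem_account() recursion guard
 * described above: a per-task counter is raised around any path that may
 * allocate while a per-memcg cache is being created, so the cache lookup
 * falls back to the root cache instead of recursing into cache creation.
 * All names here are illustrative.
 */
#include <stdio.h>

static __thread int kmem_skip_account;	/* models current->memcg_kmem_skip_account */

static void stop_kmem_account(void)   { kmem_skip_account++; }
static void resume_kmem_account(void) { kmem_skip_account--; }

static const char *pick_cache(void)
{
	return kmem_skip_account ? "root cache (skipped)" : "per-memcg cache";
}

static void create_cache_enqueue(void)
{
	stop_kmem_account();
	/* allocations made here see the raised counter and do not recurse */
	printf("while enqueueing: %s\n", pick_cache());
	resume_kmem_account();
}

int main(void)
{
	printf("normal allocation: %s\n", pick_cache());
	create_cache_enqueue();
	printf("after enqueue: %s\n", pick_cache());
	return 0;
}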
+	 */ +	schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ +	char *name; +	struct dentry *dentry; + +	rcu_read_lock(); +	dentry = rcu_dereference(memcg->css.cgroup->dentry); +	rcu_read_unlock(); + +	BUG_ON(dentry == NULL); + +	name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, +			 memcg_cache_id(memcg), dentry->d_name.name); + +	return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, +					 struct kmem_cache *s) +{ +	char *name; +	struct kmem_cache *new; + +	name = memcg_cache_name(memcg, s); +	if (!name) +		return NULL; + +	new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, +				      (s->flags & ~SLAB_PANIC), s->ctor, s); + +	if (new) +		new->allocflags |= __GFP_KMEMCG; + +	kfree(name); +	return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, +						  struct kmem_cache *cachep) +{ +	struct kmem_cache *new_cachep; +	int idx; + +	BUG_ON(!memcg_can_account_kmem(memcg)); + +	idx = memcg_cache_id(memcg); + +	mutex_lock(&memcg_cache_mutex); +	new_cachep = cachep->memcg_params->memcg_caches[idx]; +	if (new_cachep) +		goto out; + +	new_cachep = kmem_cache_dup(memcg, cachep); +	if (new_cachep == NULL) { +		new_cachep = cachep; +		goto out; +	} + +	mem_cgroup_get(memcg); +	atomic_set(&new_cachep->memcg_params->nr_pages , 0); + +	cachep->memcg_params->memcg_caches[idx] = new_cachep; +	/* +	 * the readers won't lock, make sure everybody sees the updated value, +	 * so they won't put stuff in the queue again for no reason +	 */ +	wmb(); +out: +	mutex_unlock(&memcg_cache_mutex); +	return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ +	struct kmem_cache *c; +	int i; + +	if (!s->memcg_params) +		return; +	if (!s->memcg_params->is_root_cache) +		return; + +	/* +	 * If the cache is being destroyed, we trust that there is no one else +	 * requesting objects from it. Even if there are, the sanity checks in +	 * kmem_cache_destroy should caught this ill-case. +	 * +	 * Still, we don't want anyone else freeing memcg_caches under our +	 * noses, which can happen if a new memcg comes to life. As usual, +	 * we'll take the set_limit_mutex to protect ourselves against this. +	 */ +	mutex_lock(&set_limit_mutex); +	for (i = 0; i < memcg_limited_groups_array_size; i++) { +		c = s->memcg_params->memcg_caches[i]; +		if (!c) +			continue; + +		/* +		 * We will now manually delete the caches, so to avoid races +		 * we need to cancel all pending destruction workers and +		 * proceed with destruction ourselves. +		 * +		 * kmem_cache_destroy() will call kmem_cache_shrink internally, +		 * and that could spawn the workers again: it is likely that +		 * the cache still have active pages until this very moment. +		 * This would lead us back to mem_cgroup_destroy_cache. +		 * +		 * But that will not execute at all if the "dead" flag is not +		 * set, so flip it down to guarantee we are in control. 
+		 */ +		c->memcg_params->dead = false; +		cancel_work_sync(&c->memcg_params->destroy); +		kmem_cache_destroy(c); +	} +	mutex_unlock(&set_limit_mutex); +} + +struct create_work { +	struct mem_cgroup *memcg; +	struct kmem_cache *cachep; +	struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +	struct kmem_cache *cachep; +	struct memcg_cache_params *params; + +	if (!memcg_kmem_is_active(memcg)) +		return; + +	mutex_lock(&memcg->slab_caches_mutex); +	list_for_each_entry(params, &memcg->memcg_slab_caches, list) { +		cachep = memcg_params_to_cache(params); +		cachep->memcg_params->dead = true; +		INIT_WORK(&cachep->memcg_params->destroy, +				  kmem_cache_destroy_work_func); +		schedule_work(&cachep->memcg_params->destroy); +	} +	mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ +	struct create_work *cw; + +	cw = container_of(w, struct create_work, work); +	memcg_create_kmem_cache(cw->memcg, cw->cachep); +	/* Drop the reference gotten when we enqueued. */ +	css_put(&cw->memcg->css); +	kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, +					 struct kmem_cache *cachep) +{ +	struct create_work *cw; + +	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); +	if (cw == NULL) +		return; + +	/* The corresponding put will be done in the workqueue. */ +	if (!css_tryget(&memcg->css)) { +		kfree(cw); +		return; +	} + +	cw->memcg = memcg; +	cw->cachep = cachep; + +	INIT_WORK(&cw->work, memcg_create_cache_work_func); +	schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, +				       struct kmem_cache *cachep) +{ +	/* +	 * We need to stop accounting when we kmalloc, because if the +	 * corresponding kmalloc cache is not yet created, the first allocation +	 * in __memcg_create_cache_enqueue will recurse. +	 * +	 * However, it is better to enclose the whole function. Depending on +	 * the debugging options enabled, INIT_WORK(), for instance, can +	 * trigger an allocation. This too, will make us recurse. Because at +	 * this point we can't allow ourselves back into memcg_kmem_get_cache, +	 * the safest choice is to do it like this, wrapping the whole function. +	 */ +	memcg_stop_kmem_account(); +	__memcg_create_cache_enqueue(memcg, cachep); +	memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. 
+ */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, +					  gfp_t gfp) +{ +	struct mem_cgroup *memcg; +	int idx; + +	VM_BUG_ON(!cachep->memcg_params); +	VM_BUG_ON(!cachep->memcg_params->is_root_cache); + +	if (!current->mm || current->memcg_kmem_skip_account) +		return cachep; + +	rcu_read_lock(); +	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); +	rcu_read_unlock(); + +	if (!memcg_can_account_kmem(memcg)) +		return cachep; + +	idx = memcg_cache_id(memcg); + +	/* +	 * barrier to mare sure we're always seeing the up to date value.  The +	 * code updating memcg_caches will issue a write barrier to match this. +	 */ +	read_barrier_depends(); +	if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { +		/* +		 * If we are in a safe context (can wait, and not in interrupt +		 * context), we could be be predictable and return right away. +		 * This would guarantee that the allocation being performed +		 * already belongs in the new cache. +		 * +		 * However, there are some clashes that can arrive from locking. +		 * For instance, because we acquire the slab_mutex while doing +		 * kmem_cache_dup, this means no further allocation could happen +		 * with the slab_mutex held. +		 * +		 * Also, because cache creation issue get_online_cpus(), this +		 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, +		 * that ends up reversed during cpu hotplug. (cpuset allocates +		 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, +		 * better to defer everything. +		 */ +		memcg_create_cache_enqueue(memcg, cachep); +		return cachep; +	} + +	return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer.  We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ +	struct mem_cgroup *memcg; +	int ret; + +	*_memcg = NULL; +	memcg = try_get_mem_cgroup_from_mm(current->mm); + +	/* +	 * very rare case described in mem_cgroup_from_task. Unfortunately there +	 * isn't much we can do without complicating this too much, and it would +	 * be gfp-dependent anyway. Just let it go +	 */ +	if (unlikely(!memcg)) +		return true; + +	if (!memcg_can_account_kmem(memcg)) { +		css_put(&memcg->css); +		return true; +	} + +	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); +	if (!ret) +		*_memcg = memcg; + +	css_put(&memcg->css); +	return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, +			      int order) +{ +	struct page_cgroup *pc; + +	VM_BUG_ON(mem_cgroup_is_root(memcg)); + +	/* The page allocation failed. 
Revert */ +	if (!page) { +		memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +		return; +	} + +	pc = lookup_page_cgroup(page); +	lock_page_cgroup(pc); +	pc->mem_cgroup = memcg; +	SetPageCgroupUsed(pc); +	unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ +	struct mem_cgroup *memcg = NULL; +	struct page_cgroup *pc; + + +	pc = lookup_page_cgroup(page); +	/* +	 * Fast unlocked return. Theoretically might have changed, have to +	 * check again after locking. +	 */ +	if (!PageCgroupUsed(pc)) +		return; + +	lock_page_cgroup(pc); +	if (PageCgroupUsed(pc)) { +		memcg = pc->mem_cgroup; +		ClearPageCgroupUsed(pc); +	} +	unlock_page_cgroup(pc); + +	/* +	 * We trust that only if there is a memcg associated with the page, it +	 * is a valid allocation +	 */ +	if (!memcg) +		return; + +	VM_BUG_ON(mem_cgroup_is_root(memcg)); +	memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ +  #ifdef CONFIG_TRANSPARENT_HUGEPAGE  #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) @@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)  }  #endif -static DEFINE_MUTEX(set_limit_mutex); -  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,  				unsigned long long val)  { @@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,  static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)  {  	int node, zid; +	u64 usage;  	do {  		/* This is for making all *used* pages to be on LRU. */ @@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)  		cond_resched();  		/* +		 * Kernel memory may not necessarily be trackable to a specific +		 * process. So they are not migrated, and therefore we can't +		 * expect their value to drop to 0 here. +		 * Having res filled up with kmem only is enough. +		 *  		 * This is a safety check because mem_cgroup_force_empty_list  		 * could have raced with mem_cgroup_replace_page_cache callers  		 * so the lru seemed empty but the page could have been added  		 * right after the check. RES_USAGE should be safe as we always  		 * charge before adding to the LRU.  		 
*/ -	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); +		usage = res_counter_read_u64(&memcg->res, RES_USAGE) - +			res_counter_read_u64(&memcg->kmem, RES_USAGE); +	} while (usage > 0);  }  /* @@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	char str[64];  	u64 val; -	int type, name, len; +	int name, len; +	enum res_type type;  	type = MEMFILE_TYPE(cft->private);  	name = MEMFILE_ATTR(cft->private); @@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,  		else  			val = res_counter_read_u64(&memcg->memsw, name);  		break; +	case _KMEM: +		val = res_counter_read_u64(&memcg->kmem, name); +		break;  	default:  		BUG();  	} @@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,  	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);  	return simple_read_from_buffer(buf, nbytes, ppos, str, len);  } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ +	int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM +	bool must_inc_static_branch = false; + +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); +	/* +	 * For simplicity, we won't allow this to be disabled.  It also can't +	 * be changed if the cgroup has children already, or if tasks had +	 * already joined. +	 * +	 * If tasks join before we set the limit, a person looking at +	 * kmem.usage_in_bytes will have no way to determine when it took +	 * place, which makes the value quite meaningless. +	 * +	 * After it first became limited, changes in the value of the limit are +	 * of course permitted. +	 * +	 * Taking the cgroup_lock is really offensive, but it is so far the only +	 * way to guarantee that no children will appear. There are plenty of +	 * other offenders, and they should all go away. Fine grained locking +	 * is probably the way to go here. When we are fully hierarchical, we +	 * can also get rid of the use_hierarchy check. +	 */ +	cgroup_lock(); +	mutex_lock(&set_limit_mutex); +	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { +		if (cgroup_task_count(cont) || (memcg->use_hierarchy && +						!list_empty(&cont->children))) { +			ret = -EBUSY; +			goto out; +		} +		ret = res_counter_set_limit(&memcg->kmem, val); +		VM_BUG_ON(ret); + +		ret = memcg_update_cache_sizes(memcg); +		if (ret) { +			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); +			goto out; +		} +		must_inc_static_branch = true; +		/* +		 * kmem charges can outlive the cgroup. In the case of slab +		 * pages, for instance, a page contain objects from various +		 * processes, so it is unfeasible to migrate them away. We +		 * need to reference count the memcg because of that. +		 */ +		mem_cgroup_get(memcg); +	} else +		ret = res_counter_set_limit(&memcg->kmem, val); +out: +	mutex_unlock(&set_limit_mutex); +	cgroup_unlock(); + +	/* +	 * We are by now familiar with the fact that we can't inc the static +	 * branch inside cgroup_lock. See disarm functions for details. A +	 * worker here is overkill, but also wrong: After the limit is set, we +	 * must start accounting right away. Since this operation can't fail, +	 * we can safely defer it to here - no rollback will be needed. 
+	 * +	 * The boolean used to control this is also safe, because +	 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be +	 * able to set it to true; +	 */ +	if (must_inc_static_branch) { +		static_key_slow_inc(&memcg_kmem_enabled_key); +		/* +		 * setting the active bit after the inc will guarantee no one +		 * starts accounting before all call sites are patched +		 */ +		memcg_kmem_set_active(memcg); +	} + +#endif +	return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ +	int ret = 0; +	struct mem_cgroup *parent = parent_mem_cgroup(memcg); +	if (!parent) +		goto out; + +	memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM +	/* +	 * When that happen, we need to disable the static branch only on those +	 * memcgs that enabled it. To achieve this, we would be forced to +	 * complicate the code by keeping track of which memcgs were the ones +	 * that actually enabled limits, and which ones got it from its +	 * parents. +	 * +	 * It is a lot simpler just to do static_key_slow_inc() on every child +	 * that is accounted. +	 */ +	if (!memcg_kmem_is_active(memcg)) +		goto out; + +	/* +	 * destroy(), called if we fail, will issue static_key_slow_inc() and +	 * mem_cgroup_put() if kmem is enabled. We have to either call them +	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find +	 * this more consistent, since it always leads to the same destroy path +	 */ +	mem_cgroup_get(memcg); +	static_key_slow_inc(&memcg_kmem_enabled_key); + +	mutex_lock(&set_limit_mutex); +	ret = memcg_update_cache_sizes(memcg); +	mutex_unlock(&set_limit_mutex); +#endif +out: +	return ret; +} +  /*   * The user of this function is...   * RES_LIMIT. @@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,  			    const char *buffer)  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	int type, name; +	enum res_type type; +	int name;  	unsigned long long val;  	int ret; @@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,  			break;  		if (type == _MEM)  			ret = mem_cgroup_resize_limit(memcg, val); -		else +		else if (type == _MEMSWAP)  			ret = mem_cgroup_resize_memsw_limit(memcg, val); +		else if (type == _KMEM) +			ret = memcg_update_kmem_limit(cont, val); +		else +			return -EINVAL;  		break;  	case RES_SOFT_LIMIT:  		ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4054,7 +5097,8 @@ out:  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	int type, name; +	int name; +	enum res_type type;  	type = MEMFILE_TYPE(event);  	name = MEMFILE_ATTR(event); @@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)  	case RES_MAX_USAGE:  		if (type == _MEM)  			res_counter_reset_max(&memcg->res); -		else +		else if (type == _MEMSWAP)  			res_counter_reset_max(&memcg->memsw); +		else if (type == _KMEM) +			res_counter_reset_max(&memcg->kmem); +		else +			return -EINVAL;  		break;  	case RES_FAILCNT:  		if (type == _MEM)  			res_counter_reset_failcnt(&memcg->res); -		else +		else if (type == _MEMSWAP)  			res_counter_reset_failcnt(&memcg->memsw); +		else if (type == _KMEM) +			res_counter_reset_failcnt(&memcg->kmem); +		else +			return -EINVAL;  		break;  	} @@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup_thresholds *thresholds;  	
struct mem_cgroup_threshold_ary *new; -	int type = MEMFILE_TYPE(cft->private); +	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 threshold, usage;  	int i, size, ret; @@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup_thresholds *thresholds;  	struct mem_cgroup_threshold_ary *new; -	int type = MEMFILE_TYPE(cft->private); +	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 usage;  	int i, j, size; @@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup_eventfd_list *event; -	int type = MEMFILE_TYPE(cft->private); +	enum res_type type = MEMFILE_TYPE(cft->private);  	BUG_ON(type != _OOM_TYPE);  	event = kmalloc(sizeof(*event),	GFP_KERNEL); @@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);  	struct mem_cgroup_eventfd_list *ev, *tmp; -	int type = MEMFILE_TYPE(cft->private); +	enum res_type type = MEMFILE_TYPE(cft->private);  	BUG_ON(type != _OOM_TYPE); @@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,  #ifdef CONFIG_MEMCG_KMEM  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)  { +	int ret; + +	memcg->kmemcg_id = -1; +	ret = memcg_propagate_kmem(memcg); +	if (ret) +		return ret; +  	return mem_cgroup_sockets_init(memcg, ss);  };  static void kmem_cgroup_destroy(struct mem_cgroup *memcg)  {  	mem_cgroup_sockets_destroy(memcg); + +	memcg_kmem_mark_dead(memcg); + +	if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) +		return; + +	/* +	 * Charges already down to 0, undo mem_cgroup_get() done in the charge +	 * path here, being careful not to race with memcg_uncharge_kmem: it is +	 * possible that the charges went down to 0 between mark_dead and the +	 * res_counter read, so in that case, we don't need the put +	 */ +	if (memcg_kmem_test_and_clear_dead(memcg)) +		mem_cgroup_put(memcg);  }  #else  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {  		.read = mem_cgroup_read,  	},  #endif +#ifdef CONFIG_MEMCG_KMEM +	{ +		.name = "kmem.limit_in_bytes", +		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), +		.write_string = mem_cgroup_write, +		.read = mem_cgroup_read, +	}, +	{ +		.name = "kmem.usage_in_bytes", +		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), +		.read = mem_cgroup_read, +	}, +	{ +		.name = "kmem.failcnt", +		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), +		.trigger = mem_cgroup_reset, +		.read = mem_cgroup_read, +	}, +	{ +		.name = "kmem.max_usage_in_bytes", +		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), +		.trigger = mem_cgroup_reset, +		.read = mem_cgroup_read, +	}, +#ifdef CONFIG_SLABINFO +	{ +		.name = "kmem.slabinfo", +		.read_seq_string = mem_cgroup_slabinfo_read, +	}, +#endif +#endif  	{ },	/* terminate */  }; @@ -4816,16 +5920,29 @@ out_free:  }  /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context.  The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) 
+ * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap.   */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg)  { -	struct mem_cgroup *memcg; +	int node;  	int size = sizeof(struct mem_cgroup); -	memcg = container_of(work, struct mem_cgroup, work_freeing); +	mem_cgroup_remove_from_trees(memcg); +	free_css_id(&mem_cgroup_subsys, &memcg->css); + +	for_each_node(node) +		free_mem_cgroup_per_zone_info(memcg, node); + +	free_percpu(memcg->stat); +  	/*  	 * We need to make sure that (at least for now), the jump label  	 * destruction code runs outside of the cgroup lock. This is because @@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work)  	 * to move this code around, and make sure it is outside  	 * the cgroup_lock.  	 */ -	disarm_sock_keys(memcg); +	disarm_static_keys(memcg);  	if (size < PAGE_SIZE)  		kfree(memcg);  	else  		vfree(memcg);  } -static void free_rcu(struct rcu_head *rcu_head) -{ -	struct mem_cgroup *memcg; - -	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); -	INIT_WORK(&memcg->work_freeing, free_work); -	schedule_work(&memcg->work_freeing); -}  /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context.  The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw.   */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work)  { -	int node; +	struct mem_cgroup *memcg; -	mem_cgroup_remove_from_trees(memcg); -	free_css_id(&mem_cgroup_subsys, &memcg->css); +	memcg = container_of(work, struct mem_cgroup, work_freeing); +	__mem_cgroup_free(memcg); +} -	for_each_node(node) -		free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ +	struct mem_cgroup *memcg; -	free_percpu(memcg->stat); -	call_rcu(&memcg->rcu_freeing, free_rcu); +	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); +	INIT_WORK(&memcg->work_freeing, free_work); +	schedule_work(&memcg->work_freeing);  }  static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)  {  	if (atomic_sub_and_test(count, &memcg->refcnt)) {  		struct mem_cgroup *parent = parent_mem_cgroup(memcg); -		__mem_cgroup_free(memcg); +		call_rcu(&memcg->rcu_freeing, free_rcu);  		if (parent)  			mem_cgroup_put(parent);  	} @@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont)  	if (parent && parent->use_hierarchy) {  		res_counter_init(&memcg->res, &parent->res);  		res_counter_init(&memcg->memsw, &parent->memsw); +		res_counter_init(&memcg->kmem, &parent->kmem); +  		/*  		 * We increment refcnt of the parent to ensure that we can  		 * safely access it on res_counter_charge/uncharge. 
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)  	} else {  		res_counter_init(&memcg->res, NULL);  		res_counter_init(&memcg->memsw, NULL); +		res_counter_init(&memcg->kmem, NULL);  		/*  		 * Deeper hierachy with use_hierarchy == false doesn't make  		 * much sense so let cgroup subsystem know about this @@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont)  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	mem_cgroup_reparent_charges(memcg); +	mem_cgroup_destroy_all_caches(memcg);  }  static void mem_cgroup_css_free(struct cgroup *cont)  |
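For reference, the control files added by this patch (kmem.limit_in_bytes, kmem.usage_in_bytes, kmem.failcnt, kmem.max_usage_in_bytes and, with CONFIG_SLABINFO, kmem.slabinfo) appear under the memory controller with the usual "memory." prefix. The stand-alone sketch below (not part of the patch) shows one way to exercise them from user space; it assumes a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory and a pre-created group named "test", and both the paths and the limit value are illustrative. As noted in memcg_update_kmem_limit(), the kmem limit has to be set before tasks join the group or children are created.

#include <stdio.h>
#include <stdlib.h>

#define CG "/sys/fs/cgroup/memory/test/"	/* assumed mount point and group */

static void write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) { perror(path); exit(1); }
	fputs(val, f);
	fclose(f);
}

static void show_file(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) { perror(path); exit(1); }
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	/* Set the kmem limit first; this is what flips KMEM_ACCOUNTED_ACTIVE. */
	write_file(CG "memory.kmem.limit_in_bytes", "104857600\n");	/* 100 MiB */
	show_file(CG "memory.kmem.limit_in_bytes");
	show_file(CG "memory.kmem.usage_in_bytes");
	show_file(CG "memory.kmem.failcnt");
	return 0;
}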