| field | value | date |
|---|---|---|
| author | Olof Johansson <olof@lixom.net> | 2013-10-28 14:39:03 -0700 |
| committer | Olof Johansson <olof@lixom.net> | 2013-10-28 14:39:03 -0700 |
| commit | 43d93947a54cf9323198a3a37eaf3ec14adb23e1 | |
| tree | 0c290a7fcbc644b94527f399e4f9509a8d379a5d /mm/memcontrol.c | |
| parent | 02673f94d04e629e4cdc41e2bf2dc980743cf3df | |
| parent | 54b89756a14aa1043507ce0811b4b6c02c5dddcc | |
Merge tag 'omap-for-v3.13/cm-scm-cleanup-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap into next/cleanup
From Paul Walmsley <paul@pwsan.com> via Tony Lindgren:
Move some of the OMAP2+ CM and System Control Module direct
register accesses into CM- and System Control
Module-specific "drivers" underneath arch/arm/mach-omap2/.  This
is a prerequisite for moving this code out of arch/arm/mach-omap2/ into
drivers/.
Basic test logs are available here:
http://www.pwsan.com/omap/testlogs/cm_scm_cleanup_a_v3.13/20131019101809/
* tag 'omap-for-v3.13/cm-scm-cleanup-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap:
  ARM: OMAP3: control: add API for setting IVA bootmode
  ARM: OMAP3: CM/control: move CM scratchpad save to CM driver
  ARM: OMAP3: McBSP: do not access CM register directly
  ARM: OMAP3: clock: add API to enable/disable autoidle for a single clock
  ARM: OMAP2: CM/PM: remove direct register accesses outside CM code
  + Linux 3.12-rc4
Signed-off-by: Olof Johansson <olof@lixom.net>
Diffstat (limited to 'mm/memcontrol.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mm/memcontrol.c | 554 |
1 file changed, 404 insertions, 150 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..1c52ddbc839b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@  #include <linux/limits.h>  #include <linux/export.h>  #include <linux/mutex.h> +#include <linux/rbtree.h>  #include <linux/slab.h>  #include <linux/swap.h>  #include <linux/swapops.h> @@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {  	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; +	struct rb_node		tree_node;	/* RB tree node */ +	unsigned long long	usage_in_excess;/* Set to the value by which */ +						/* the soft limit is exceeded*/ +	bool			on_tree;  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */  						/* use container_of	   */  }; @@ -168,6 +173,26 @@ struct mem_cgroup_per_node {  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];  }; +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { +	struct rb_root rb_root; +	spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { +	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { +	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; +  struct mem_cgroup_threshold {  	struct eventfd_ctx *eventfd;  	u64 threshold; @@ -303,22 +328,6 @@ struct mem_cgroup {  	atomic_t	numainfo_events;  	atomic_t	numainfo_updating;  #endif -	/* -	 * Protects soft_contributed transitions. -	 * See mem_cgroup_update_soft_limit -	 */ -	spinlock_t soft_lock; - -	/* -	 * If true then this group has increased parents' children_in_excess -	 * when it got over the soft limit. -	 * When a group falls bellow the soft limit, parents' children_in_excess -	 * is decreased and soft_contributed changed to false. -	 */ -	bool soft_contributed; - -	/* Number of children that are in soft limit excess */ -	atomic_t children_in_excess;  	struct mem_cgroup_per_node *nodeinfo[0];  	/* WARNING: nodeinfo must be the last member here */ @@ -422,6 +431,7 @@ static bool move_file(void)   * limit reclaim to prevent infinite loops, if they ever occur.   
*/  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100 +#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2  enum charge_type {  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)  	return mem_cgroup_zoneinfo(memcg, nid, zid);  } +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ +	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ +	int nid = page_to_nid(page); +	int zid = page_zonenum(page); + +	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, +				struct mem_cgroup_per_zone *mz, +				struct mem_cgroup_tree_per_zone *mctz, +				unsigned long long new_usage_in_excess) +{ +	struct rb_node **p = &mctz->rb_root.rb_node; +	struct rb_node *parent = NULL; +	struct mem_cgroup_per_zone *mz_node; + +	if (mz->on_tree) +		return; + +	mz->usage_in_excess = new_usage_in_excess; +	if (!mz->usage_in_excess) +		return; +	while (*p) { +		parent = *p; +		mz_node = rb_entry(parent, struct mem_cgroup_per_zone, +					tree_node); +		if (mz->usage_in_excess < mz_node->usage_in_excess) +			p = &(*p)->rb_left; +		/* +		 * We can't avoid mem cgroups that are over their soft +		 * limit by the same amount +		 */ +		else if (mz->usage_in_excess >= mz_node->usage_in_excess) +			p = &(*p)->rb_right; +	} +	rb_link_node(&mz->tree_node, parent, p); +	rb_insert_color(&mz->tree_node, &mctz->rb_root); +	mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, +				struct mem_cgroup_per_zone *mz, +				struct mem_cgroup_tree_per_zone *mctz) +{ +	if (!mz->on_tree) +		return; +	rb_erase(&mz->tree_node, &mctz->rb_root); +	mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, +				struct mem_cgroup_per_zone *mz, +				struct mem_cgroup_tree_per_zone *mctz) +{ +	spin_lock(&mctz->lock); +	__mem_cgroup_remove_exceeded(memcg, mz, mctz); +	spin_unlock(&mctz->lock); +} + + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +{ +	unsigned long long excess; +	struct mem_cgroup_per_zone *mz; +	struct mem_cgroup_tree_per_zone *mctz; +	int nid = page_to_nid(page); +	int zid = page_zonenum(page); +	mctz = soft_limit_tree_from_page(page); + +	/* +	 * Necessary to update all ancestors when hierarchy is used. +	 * because their event counter is not touched. +	 */ +	for (; memcg; memcg = parent_mem_cgroup(memcg)) { +		mz = mem_cgroup_zoneinfo(memcg, nid, zid); +		excess = res_counter_soft_limit_excess(&memcg->res); +		/* +		 * We have to update the tree if mz is on RB-tree or +		 * mem is over its softlimit. +		 */ +		if (excess || mz->on_tree) { +			spin_lock(&mctz->lock); +			/* if on-tree, remove it */ +			if (mz->on_tree) +				__mem_cgroup_remove_exceeded(memcg, mz, mctz); +			/* +			 * Insert again. mz->usage_in_excess will be updated. +			 * If excess is 0, no tree ops. 
+			 */ +			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); +			spin_unlock(&mctz->lock); +		} +	} +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) +{ +	int node, zone; +	struct mem_cgroup_per_zone *mz; +	struct mem_cgroup_tree_per_zone *mctz; + +	for_each_node(node) { +		for (zone = 0; zone < MAX_NR_ZONES; zone++) { +			mz = mem_cgroup_zoneinfo(memcg, node, zone); +			mctz = soft_limit_tree_node_zone(node, zone); +			mem_cgroup_remove_exceeded(memcg, mz, mctz); +		} +	} +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ +	struct rb_node *rightmost = NULL; +	struct mem_cgroup_per_zone *mz; + +retry: +	mz = NULL; +	rightmost = rb_last(&mctz->rb_root); +	if (!rightmost) +		goto done;		/* Nothing to reclaim from */ + +	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); +	/* +	 * Remove the node now but someone else can add it back, +	 * we will to add it back at the end of reclaim to its correct +	 * position in the tree. +	 */ +	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +	if (!res_counter_soft_limit_excess(&mz->memcg->res) || +		!css_tryget(&mz->memcg->css)) +		goto retry; +done: +	return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ +	struct mem_cgroup_per_zone *mz; + +	spin_lock(&mctz->lock); +	mz = __mem_cgroup_largest_soft_limit_node(mctz); +	spin_unlock(&mctz->lock); +	return mz; +} +  /*   * Implementation Note: reading percpu statistics for memcg.   * @@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,  }  /* - * Called from rate-limited memcg_check_events when enough - * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure - * that all the parents up the hierarchy will be notified that this group - * is in excess or that it is not in excess anymore. mmecg->soft_contributed - * makes the transition a single action whenever the state flips from one to - * the other. - */ -static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) -{ -	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); -	struct mem_cgroup *parent = memcg; -	int delta = 0; - -	spin_lock(&memcg->soft_lock); -	if (excess) { -		if (!memcg->soft_contributed) { -			delta = 1; -			memcg->soft_contributed = true; -		} -	} else { -		if (memcg->soft_contributed) { -			delta = -1; -			memcg->soft_contributed = false; -		} -	} - -	/* -	 * Necessary to update all ancestors when hierarchy is used -	 * because their event counter is not touched. -	 * We track children even outside the hierarchy for the root -	 * cgroup because tree walk starting at root should visit -	 * all cgroups and we want to prevent from pointless tree -	 * walk if no children is below the limit. -	 */ -	while (delta && (parent = parent_mem_cgroup(parent))) -		atomic_add(delta, &parent->children_in_excess); -	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) -		atomic_add(delta, &root_mem_cgroup->children_in_excess); -	spin_unlock(&memcg->soft_lock); -} - -/*   * Check events in order.   
*   */ @@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)  		mem_cgroup_threshold(memcg);  		if (unlikely(do_softlimit)) -			mem_cgroup_update_soft_limit(memcg); +			mem_cgroup_update_tree(memcg, page);  #if MAX_NUMNODES > 1  		if (unlikely(do_numainfo))  			atomic_inc(&memcg->numainfo_events); @@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)  	return memcg;  } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, -		mem_cgroup_iter_filter cond) -{ -	if (!cond) -		return VISIT; -	return cond(memcg, root); -} -  /*   * Returns a next (in a pre-order walk) alive memcg (with elevated css   * ref. count) or NULL if the whole root's subtree has been visited. @@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,   * helper function to be used by mem_cgroup_iter   */  static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, -		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) +		struct mem_cgroup *last_visited)  {  	struct cgroup_subsys_state *prev_css, *next_css; @@ -963,31 +1080,11 @@ skip_node:  	if (next_css) {  		struct mem_cgroup *mem = mem_cgroup_from_css(next_css); -		switch (mem_cgroup_filter(mem, root, cond)) { -		case SKIP: +		if (css_tryget(&mem->css)) +			return mem; +		else {  			prev_css = next_css;  			goto skip_node; -		case SKIP_TREE: -			if (mem == root) -				return NULL; -			/* -			 * css_rightmost_descendant is not an optimal way to -			 * skip through a subtree (especially for imbalanced -			 * trees leaning to right) but that's what we have right -			 * now. More effective solution would be traversing -			 * right-up for first non-NULL without calling -			 * css_next_descendant_pre afterwards. -			 */ -			prev_css = css_rightmost_descendant(next_css); -			goto skip_node; -		case VISIT: -			if (css_tryget(&mem->css)) -				return mem; -			else { -				prev_css = next_css; -				goto skip_node; -			} -			break;  		}  	} @@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,   * @root: hierarchy root   * @prev: previously returned memcg, NULL on first invocation   * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter   *   * Returns references to children of the hierarchy below @root, or   * @root itself, or %NULL after a full round-trip. @@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,   * divide up the memcgs in the hierarchy among all concurrent   * reclaimers operating on the same zone and priority.   
*/ -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  				   struct mem_cgroup *prev, -				   struct mem_cgroup_reclaim_cookie *reclaim, -				   mem_cgroup_iter_filter cond) +				   struct mem_cgroup_reclaim_cookie *reclaim)  {  	struct mem_cgroup *memcg = NULL;  	struct mem_cgroup *last_visited = NULL; -	if (mem_cgroup_disabled()) { -		/* first call must return non-NULL, second return NULL */ -		return (struct mem_cgroup *)(unsigned long)!prev; -	} +	if (mem_cgroup_disabled()) +		return NULL;  	if (!root)  		root = root_mem_cgroup; @@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,  	if (!root->use_hierarchy && root != root_mem_cgroup) {  		if (prev)  			goto out_css_put; -		if (mem_cgroup_filter(root, root, cond) == VISIT) -			return root; -		return NULL; +		return root;  	}  	rcu_read_lock(); @@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,  			last_visited = mem_cgroup_iter_load(iter, root, &seq);  		} -		memcg = __mem_cgroup_iter_next(root, last_visited, cond); +		memcg = __mem_cgroup_iter_next(root, last_visited);  		if (reclaim) {  			mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,  				reclaim->generation = iter->generation;  		} -		/* -		 * We have finished the whole tree walk or no group has been -		 * visited because filter told us to skip the root node. -		 */ -		if (!memcg && (prev || (cond && !last_visited))) +		if (prev && !memcg)  			goto out_unlock;  	}  out_unlock: @@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,  	return total;  } -#if MAX_NUMNODES > 1  /**   * test_mem_cgroup_node_reclaimable   * @memcg: the target memcg @@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,  	return false;  } +#if MAX_NUMNODES > 1  /*   * Always updating the nodemask is not very good - even if we have an empty @@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)  	return node;  } +/* + * Check all nodes whether it contains reclaimable pages or not. + * For quick scan, we make use of scan_nodes. This will allow us to skip + * unused nodes. But scan_nodes is lazily updated and may not cotain + * enough new information. We need to do double check. + */ +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ +	int nid; + +	/* +	 * quick check...making use of scan_node. +	 * We can skip unused nodes. +	 */ +	if (!nodes_empty(memcg->scan_nodes)) { +		for (nid = first_node(memcg->scan_nodes); +		     nid < MAX_NUMNODES; +		     nid = next_node(nid, memcg->scan_nodes)) { + +			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) +				return true; +		} +	} +	/* +	 * Check rest of nodes. 
+	 */ +	for_each_node_state(nid, N_MEMORY) { +		if (node_isset(nid, memcg->scan_nodes)) +			continue; +		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) +			return true; +	} +	return false; +} +  #else  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)  {  	return 0;  } +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ +	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); +}  #endif -/* - * A group is eligible for the soft limit reclaim under the given root - * hierarchy if - *	a) it is over its soft limit - *	b) any parent up the hierarchy is over its soft limit - * - * If the given group doesn't have any children over the limit then it - * doesn't make any sense to iterate its subtree. - */ -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, -		struct mem_cgroup *root) +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, +				   struct zone *zone, +				   gfp_t gfp_mask, +				   unsigned long *total_scanned)  { -	struct mem_cgroup *parent; - -	if (!memcg) -		memcg = root_mem_cgroup; -	parent = memcg; +	struct mem_cgroup *victim = NULL; +	int total = 0; +	int loop = 0; +	unsigned long excess; +	unsigned long nr_scanned; +	struct mem_cgroup_reclaim_cookie reclaim = { +		.zone = zone, +		.priority = 0, +	}; -	if (res_counter_soft_limit_excess(&memcg->res)) -		return VISIT; +	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; -	/* -	 * If any parent up to the root in the hierarchy is over its soft limit -	 * then we have to obey and reclaim from this group as well. -	 */ -	while ((parent = parent_mem_cgroup(parent))) { -		if (res_counter_soft_limit_excess(&parent->res)) -			return VISIT; -		if (parent == root) +	while (1) { +		victim = mem_cgroup_iter(root_memcg, victim, &reclaim); +		if (!victim) { +			loop++; +			if (loop >= 2) { +				/* +				 * If we have not been able to reclaim +				 * anything, it might because there are +				 * no reclaimable pages under this hierarchy +				 */ +				if (!total) +					break; +				/* +				 * We want to do more targeted reclaim. +				 * excess >> 2 is not to excessive so as to +				 * reclaim too much, nor too less that we keep +				 * coming back to reclaim from this cgroup +				 */ +				if (total >= (excess >> 2) || +					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) +					break; +			} +			continue; +		} +		if (!mem_cgroup_reclaimable(victim, false)) +			continue; +		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, +						     zone, &nr_scanned); +		*total_scanned += nr_scanned; +		if (!res_counter_soft_limit_excess(&root_memcg->res))  			break;  	} - -	if (!atomic_read(&memcg->children_in_excess)) -		return SKIP_TREE; -	return SKIP; +	mem_cgroup_iter_break(root_memcg, victim); +	return total;  }  static DEFINE_SPINLOCK(memcg_oom_lock); @@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	unlock_page_cgroup(pc);  	/* -	 * "charge_statistics" updated event counter. +	 * "charge_statistics" updated event counter. Then, check it. +	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. +	 * if they exceeds softlimit.  	 
*/  	memcg_check_events(memcg, page);  } @@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,  	return ret;  } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +					    gfp_t gfp_mask, +					    unsigned long *total_scanned) +{ +	unsigned long nr_reclaimed = 0; +	struct mem_cgroup_per_zone *mz, *next_mz = NULL; +	unsigned long reclaimed; +	int loop = 0; +	struct mem_cgroup_tree_per_zone *mctz; +	unsigned long long excess; +	unsigned long nr_scanned; + +	if (order > 0) +		return 0; + +	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); +	/* +	 * This loop can run a while, specially if mem_cgroup's continuously +	 * keep exceeding their soft limit and putting the system under +	 * pressure +	 */ +	do { +		if (next_mz) +			mz = next_mz; +		else +			mz = mem_cgroup_largest_soft_limit_node(mctz); +		if (!mz) +			break; + +		nr_scanned = 0; +		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, +						    gfp_mask, &nr_scanned); +		nr_reclaimed += reclaimed; +		*total_scanned += nr_scanned; +		spin_lock(&mctz->lock); + +		/* +		 * If we failed to reclaim anything from this memory cgroup +		 * it is time to move on to the next cgroup +		 */ +		next_mz = NULL; +		if (!reclaimed) { +			do { +				/* +				 * Loop until we find yet another one. +				 * +				 * By the time we get the soft_limit lock +				 * again, someone might have aded the +				 * group back on the RB tree. Iterate to +				 * make sure we get a different mem. +				 * mem_cgroup_largest_soft_limit_node returns +				 * NULL if no other cgroup is present on +				 * the tree +				 */ +				next_mz = +				__mem_cgroup_largest_soft_limit_node(mctz); +				if (next_mz == mz) +					css_put(&next_mz->memcg->css); +				else /* next_mz == NULL or other memcg */ +					break; +			} while (1); +		} +		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +		excess = res_counter_soft_limit_excess(&mz->memcg->res); +		/* +		 * One school of thought says that we should not add +		 * back the node to the tree if reclaim returns 0. +		 * But our reclaim could return 0, simply because due +		 * to priority we are exposing a smaller subset of +		 * memory to reclaim from. Consider this as a longer +		 * term TODO. +		 */ +		/* If excess == 0, no tree ops */ +		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); +		spin_unlock(&mctz->lock); +		css_put(&mz->memcg->css); +		loop++; +		/* +		 * Could not reclaim anything and there are no more +		 * mem cgroups to try or we seem to be looping without +		 * reclaiming anything. 
+		 */ +		if (!nr_reclaimed && +			(next_mz == NULL || +			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) +			break; +	} while (!nr_reclaimed); +	if (next_mz) +		css_put(&next_mz->memcg->css); +	return nr_reclaimed; +} +  /**   * mem_cgroup_force_empty_list - clears LRU of a group   * @memcg: group to clear @@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {  		mz = &pn->zoneinfo[zone];  		lruvec_init(&mz->lruvec); +		mz->usage_in_excess = 0; +		mz->on_tree = false;  		mz->memcg = memcg;  	}  	memcg->nodeinfo[node] = pn; @@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)  	int node;  	size_t size = memcg_size(); +	mem_cgroup_remove_from_trees(memcg);  	free_css_id(&mem_cgroup_subsys, &memcg->css);  	for_each_node(node) @@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)  }  EXPORT_SYMBOL(parent_mem_cgroup); +static void __init mem_cgroup_soft_limit_tree_init(void) +{ +	struct mem_cgroup_tree_per_node *rtpn; +	struct mem_cgroup_tree_per_zone *rtpz; +	int tmp, node, zone; + +	for_each_node(node) { +		tmp = node; +		if (!node_state(node, N_NORMAL_MEMORY)) +			tmp = -1; +		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); +		BUG_ON(!rtpn); + +		soft_limit_tree.rb_tree_per_node[node] = rtpn; + +		for (zone = 0; zone < MAX_NR_ZONES; zone++) { +			rtpz = &rtpn->rb_tree_per_zone[zone]; +			rtpz->rb_root = RB_ROOT; +			spin_lock_init(&rtpz->lock); +		} +	} +} +  static struct cgroup_subsys_state * __ref  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  { @@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  	mutex_init(&memcg->thresholds_lock);  	spin_lock_init(&memcg->move_lock);  	vmpressure_init(&memcg->vmpressure); -	spin_lock_init(&memcg->soft_lock);  	return &memcg->css; @@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)  	mem_cgroup_invalidate_reclaim_iterators(memcg);  	mem_cgroup_reparent_charges(memcg); -	if (memcg->soft_contributed) { -		while ((memcg = parent_mem_cgroup(memcg))) -			atomic_dec(&memcg->children_in_excess); - -		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) -			atomic_dec(&root_mem_cgroup->children_in_excess); -	}  	mem_cgroup_destroy_all_caches(memcg);  	vmpressure_cleanup(&memcg->vmpressure);  } @@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)  {  	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);  	enable_swap_cgroup(); +	mem_cgroup_soft_limit_tree_init();  	memcg_stock_init();  	return 0;  }  | 
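The memcontrol.c hunks above replace the per-memcg `soft_contributed`/`children_in_excess` bookkeeping with a per-node, per-zone red-black tree (`soft_limit_tree`): whenever a charge pushes a group over its soft limit, `mem_cgroup_update_tree()` (re)inserts its `mem_cgroup_per_zone` keyed by `usage_in_excess`, and `mem_cgroup_soft_limit_reclaim()` repeatedly takes the rightmost node, i.e. the group furthest over its limit. Below is a minimal userspace sketch of that ordering idea only; the `memcg_sketch` struct, helper names, and numbers are invented for illustration, and a sorted array stands in for the kernel's `rb_insert_color()`/`rb_last()` under `mctz->lock`.

```c
#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical stand-in for struct mem_cgroup_per_zone: just enough state
 * to show the ordering used by __mem_cgroup_insert_exceeded().
 */
struct memcg_sketch {
	const char *name;
	unsigned long long usage;	/* current charge (bytes) */
	unsigned long long soft_limit;	/* soft limit (bytes) */
	unsigned long long excess;	/* usage - soft_limit, 0 if under */
};

/* Sort ascending by excess; the kernel's rbtree keeps the same order. */
static int cmp_excess(const void *a, const void *b)
{
	const struct memcg_sketch *ma = a, *mb = b;

	if (ma->excess < mb->excess)
		return -1;
	return ma->excess > mb->excess;	/* equal keys compare as 0 */
}

int main(void)
{
	struct memcg_sketch groups[] = {
		{ "A", 300ULL << 20, 100ULL << 20, 0 },
		{ "B", 150ULL << 20, 200ULL << 20, 0 },	/* under its limit */
		{ "C", 500ULL << 20, 100ULL << 20, 0 },
	};
	size_t i, n = sizeof(groups) / sizeof(groups[0]);

	/* Only groups over their soft limit ever go on the tree. */
	for (i = 0; i < n; i++)
		if (groups[i].usage > groups[i].soft_limit)
			groups[i].excess = groups[i].usage - groups[i].soft_limit;

	qsort(groups, n, sizeof(groups[0]), cmp_excess);

	/*
	 * mem_cgroup_soft_limit_reclaim() takes rb_last(), i.e. the largest
	 * excess, first; walking the sorted array backwards models that.
	 */
	for (i = n; i-- > 0; ) {
		if (!groups[i].excess)
			break;	/* everything further left is under its limit */
		printf("reclaim from %s (excess %llu MiB)\n",
		       groups[i].name, groups[i].excess >> 20);
	}
	return 0;
}
```

Two details of the kernel version are worth noting from the hunks: groups with an equal excess are deliberately linked to the right rather than rejected (the "can't avoid mem cgroups that are over their soft limit by the same amount" comment in `__mem_cgroup_insert_exceeded()`), and reclaim is bounded at both levels: `mem_cgroup_soft_reclaim()` stops once it has reclaimed roughly a quarter of the root's excess (`total >= excess >> 2`) or after `MEM_CGROUP_MAX_RECLAIM_LOOPS` passes, while the outer loop in `mem_cgroup_soft_limit_reclaim()` gives up when nothing was reclaimed and either the tree is empty or more than `MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS` (2) iterations have gone by.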
