summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c972
1 files changed, 620 insertions, 352 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..7f1a356153c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -53,10 +54,12 @@
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/lockdep.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
+#include "slab.h"
#include <asm/uaccess.h>
@@ -160,6 +163,10 @@ struct mem_cgroup_per_zone {
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
@@ -168,6 +175,26 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+ struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+ struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
@@ -286,7 +313,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct tcp_memcontrol tcp_mem;
+ struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
@@ -303,22 +330,6 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
- /*
- * Protects soft_contributed transitions.
- * See mem_cgroup_update_soft_limit
- */
- spinlock_t soft_lock;
-
- /*
- * If true then this group has increased parents' children_in_excess
- * when it got over the soft limit.
- * When a group falls bellow the soft limit, parents' children_in_excess
- * is decreased and soft_contributed changed to false.
- */
- bool soft_contributed;
-
- /* Number of children that are in soft limit excess */
- atomic_t children_in_excess;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
@@ -327,7 +338,7 @@ struct mem_cgroup {
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
@@ -422,6 +433,7 @@ static bool move_file(void)
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -488,6 +500,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
+/*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ /*
+ * The ID of the root cgroup is 0, but memcg treat 0 as an
+ * invalid ID, so we return (cgroup_id + 1).
+ */
+ return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id - 1, &mem_cgroup_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
@@ -540,13 +575,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem.cg_proto;
+ return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ if (!memcg_proto_activated(&memcg->tcp_mem))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -559,16 +594,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- * allocation in ->create(). At that point, we haven't yet allocated the
- * css_id. Having a separate index prevents us from messing with the cgroup
- * core for this
+ * The main reason for not using cgroup id for this:
+ * this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -583,14 +613,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE 65535
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -648,6 +678,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
return mem_cgroup_zoneinfo(memcg, nid, zid);
}
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
+{
+ struct rb_node **p = &mctz->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct mem_cgroup_per_zone *mz_node;
+
+ if (mz->on_tree)
+ return;
+
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
+ while (*p) {
+ parent = *p;
+ mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+ tree_node);
+ if (mz->usage_in_excess < mz_node->usage_in_excess)
+ p = &(*p)->rb_left;
+ /*
+ * We can't avoid mem cgroups that are over their soft
+ * limit by the same amount
+ */
+ else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&mz->tree_node, parent, p);
+ rb_insert_color(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ if (!mz->on_tree)
+ return;
+ rb_erase(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ spin_lock(&mctz->lock);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+ unsigned long long excess;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+ mctz = soft_limit_tree_from_page(page);
+
+ /*
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ excess = res_counter_soft_limit_excess(&memcg->res);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ spin_lock(&mctz->lock);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ }
+ }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+ int node, zone;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+
+ for_each_node(node) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ mz = mem_cgroup_zoneinfo(memcg, node, zone);
+ mctz = soft_limit_tree_node_zone(node, zone);
+ mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ }
+ }
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct rb_node *rightmost = NULL;
+ struct mem_cgroup_per_zone *mz;
+
+retry:
+ mz = NULL;
+ rightmost = rb_last(&mctz->rb_root);
+ if (!rightmost)
+ goto done; /* Nothing to reclaim from */
+
+ mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+ /*
+ * Remove the node now but someone else can add it back,
+ * we will to add it back at the end of reclaim to its correct
+ * position in the tree.
+ */
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ !css_tryget(&mz->memcg->css))
+ goto retry;
+done:
+ return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ spin_lock(&mctz->lock);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ spin_unlock(&mctz->lock);
+ return mz;
+}
+
/*
* Implementation Note: reading percpu statistics for memcg.
*
@@ -698,6 +886,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
+ get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +894,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
+ put_online_cpus();
return val;
}
@@ -822,48 +1012,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
}
/*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
- unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
- struct mem_cgroup *parent = memcg;
- int delta = 0;
-
- spin_lock(&memcg->soft_lock);
- if (excess) {
- if (!memcg->soft_contributed) {
- delta = 1;
- memcg->soft_contributed = true;
- }
- } else {
- if (memcg->soft_contributed) {
- delta = -1;
- memcg->soft_contributed = false;
- }
- }
-
- /*
- * Necessary to update all ancestors when hierarchy is used
- * because their event counter is not touched.
- * We track children even outside the hierarchy for the root
- * cgroup because tree walk starting at root should visit
- * all cgroups and we want to prevent from pointless tree
- * walk if no children is below the limit.
- */
- while (delta && (parent = parent_mem_cgroup(parent)))
- atomic_add(delta, &parent->children_in_excess);
- if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
- atomic_add(delta, &root_mem_cgroup->children_in_excess);
- spin_unlock(&memcg->soft_lock);
-}
-
-/*
* Check events in order.
*
*/
@@ -886,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
- mem_cgroup_update_soft_limit(memcg);
+ mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1077,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
- mem_cgroup_iter_filter cond)
-{
- if (!cond)
- return VISIT;
- return cond(memcg, root);
-}
-
/*
* Returns a next (in a pre-order walk) alive memcg (with elevated css
* ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1084,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
* helper function to be used by mem_cgroup_iter
*/
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+ struct mem_cgroup *last_visited)
{
struct cgroup_subsys_state *prev_css, *next_css;
@@ -963,31 +1102,11 @@ skip_node:
if (next_css) {
struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
- switch (mem_cgroup_filter(mem, root, cond)) {
- case SKIP:
+ if (css_tryget(&mem->css))
+ return mem;
+ else {
prev_css = next_css;
goto skip_node;
- case SKIP_TREE:
- if (mem == root)
- return NULL;
- /*
- * css_rightmost_descendant is not an optimal way to
- * skip through a subtree (especially for imbalanced
- * trees leaning to right) but that's what we have right
- * now. More effective solution would be traversing
- * right-up for first non-NULL without calling
- * css_next_descendant_pre afterwards.
- */
- prev_css = css_rightmost_descendant(next_css);
- goto skip_node;
- case VISIT:
- if (css_tryget(&mem->css))
- return mem;
- else {
- prev_css = next_css;
- goto skip_node;
- }
- break;
}
}
@@ -1051,7 +1170,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1182,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
- struct mem_cgroup_reclaim_cookie *reclaim,
- mem_cgroup_iter_filter cond)
+ struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
- if (mem_cgroup_disabled()) {
- /* first call must return non-NULL, second return NULL */
- return (struct mem_cgroup *)(unsigned long)!prev;
- }
+ if (mem_cgroup_disabled())
+ return NULL;
if (!root)
root = root_mem_cgroup;
@@ -1086,9 +1201,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out_css_put;
- if (mem_cgroup_filter(root, root, cond) == VISIT)
- return root;
- return NULL;
+ return root;
}
rcu_read_lock();
@@ -1111,7 +1224,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
last_visited = mem_cgroup_iter_load(iter, root, &seq);
}
- memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+ memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1235,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
reclaim->generation = iter->generation;
}
- /*
- * We have finished the whole tree walk or no group has been
- * visited because filter told us to skip the root node.
- */
- if (!memcg && (prev || (cond && !last_visited)))
+ if (prev && !memcg)
goto out_unlock;
}
out_unlock:
@@ -1318,7 +1427,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return css_is_ancestor(&memcg->css, &root_memcg->css);
+ return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1767,7 +1876,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
return total;
}
-#if MAX_NUMNODES > 1
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -1790,6 +1898,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
return false;
}
+#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
@@ -1857,52 +1966,112 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
return node;
}
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+ int nid;
+
+ /*
+ * quick check...making use of scan_node.
+ * We can skip unused nodes.
+ */
+ if (!nodes_empty(memcg->scan_nodes)) {
+ for (nid = first_node(memcg->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, memcg->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ if (node_isset(nid, memcg->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- * a) it is over its soft limit
- * b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
- struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
- struct mem_cgroup *parent;
-
- if (!memcg)
- memcg = root_mem_cgroup;
- parent = memcg;
-
- if (res_counter_soft_limit_excess(&memcg->res))
- return VISIT;
+ return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
- /*
- * If any parent up to the root in the hierarchy is over its soft limit
- * then we have to obey and reclaim from this group as well.
- */
- while ((parent = parent_mem_cgroup(parent))) {
- if (res_counter_soft_limit_excess(&parent->res))
- return VISIT;
- if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+ struct zone *zone,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ struct mem_cgroup *victim = NULL;
+ int total = 0;
+ int loop = 0;
+ unsigned long excess;
+ unsigned long nr_scanned;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = 0,
+ };
+
+ excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+ while (1) {
+ victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+ if (!victim) {
+ loop++;
+ if (loop >= 2) {
+ /*
+ * If we have not been able to reclaim
+ * anything, it might because there are
+ * no reclaimable pages under this hierarchy
+ */
+ if (!total)
+ break;
+ /*
+ * We want to do more targeted reclaim.
+ * excess >> 2 is not to excessive so as to
+ * reclaim too much, nor too less that we keep
+ * coming back to reclaim from this cgroup
+ */
+ if (total >= (excess >> 2) ||
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+ break;
+ }
+ continue;
+ }
+ if (!mem_cgroup_reclaimable(victim, false))
+ continue;
+ total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+ zone, &nr_scanned);
+ *total_scanned += nr_scanned;
+ if (!res_counter_soft_limit_excess(&root_memcg->res))
break;
}
-
- if (!atomic_read(&memcg->children_in_excess))
- return SKIP_TREE;
- return SKIP;
+ mem_cgroup_iter_break(root_memcg, victim);
+ return total;
}
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
@@ -1940,7 +2109,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
}
iter->oom_lock = false;
}
- }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
spin_unlock(&memcg_oom_lock);
@@ -1952,6 +2122,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2018,110 +2189,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
memcg_wakeup_oom(memcg);
}
-/*
- * try to call OOM killer
- */
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- bool locked;
- int wakeups;
-
if (!current->memcg_oom.may_oom)
return;
-
- current->memcg_oom.in_memcg_oom = 1;
-
/*
- * As with any blocking lock, a contender needs to start
- * listening for wakeups before attempting the trylock,
- * otherwise it can miss the wakeup from the unlock and sleep
- * indefinitely. This is just open-coded because our locking
- * is so particular to memcg hierarchies.
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * Also, the caller may handle a failed allocation gracefully
+ * (like optional page cache readahead) and so an OOM killer
+ * invocation might not even be necessary.
+ *
+ * That's why we don't do anything here except remember the
+ * OOM context and then deal with it at the end of the page
+ * fault when the stack is unwound, the locks are released,
+ * and when we know whether the fault was overall successful.
*/
- wakeups = atomic_read(&memcg->oom_wakeups);
- mem_cgroup_mark_under_oom(memcg);
-
- locked = mem_cgroup_oom_trylock(memcg);
-
- if (locked)
- mem_cgroup_oom_notify(memcg);
-
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
- mem_cgroup_out_of_memory(memcg, mask, order);
- mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- } else {
- /*
- * A system call can just return -ENOMEM, but if this
- * is a page fault and somebody else is handling the
- * OOM already, we need to sleep on the OOM waitqueue
- * for this memcg until the situation is resolved.
- * Which can take some time because it might be
- * handled by a userspace task.
- *
- * However, this is the charge context, which means
- * that we may sit on a large call stack and hold
- * various filesystem locks, the mmap_sem etc. and we
- * don't want the OOM handler to deadlock on them
- * while we sit here and wait. Store the current OOM
- * context in the task_struct, then return -ENOMEM.
- * At the end of the page fault handler, with the
- * stack unwound, pagefault_out_of_memory() will check
- * back with us by calling
- * mem_cgroup_oom_synchronize(), possibly putting the
- * task to sleep.
- */
- current->memcg_oom.oom_locked = locked;
- current->memcg_oom.wakeups = wakeups;
- css_get(&memcg->css);
- current->memcg_oom.wait_on_memcg = memcg;
- }
+ css_get(&memcg->css);
+ current->memcg_oom.memcg = memcg;
+ current->memcg_oom.gfp_mask = mask;
+ current->memcg_oom.order = order;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
*
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
*
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
*/
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
{
+ struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
- struct mem_cgroup *memcg;
+ bool locked;
/* OOM is global, do not handle */
- if (!current->memcg_oom.in_memcg_oom)
- return false;
-
- /*
- * We invoked the OOM killer but there is a chance that a kill
- * did not free up any charges. Everybody else might already
- * be sleeping, so restart the fault and keep the rampage
- * going until some charges are released.
- */
- memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)
- goto out;
+ return false;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
- goto out_memcg;
+ if (!handle)
+ goto cleanup;
owait.memcg = memcg;
owait.wait.flags = 0;
@@ -2130,13 +2250,25 @@ bool mem_cgroup_oom_synchronize(void)
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- /* Only sleep if we didn't miss any wakeups since OOM */
- if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+ current->memcg_oom.order);
+ } else {
schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
- mem_cgroup_unmark_under_oom(memcg);
- if (current->memcg_oom.oom_locked) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
@@ -2145,10 +2277,9 @@ out_memcg:
*/
memcg_oom_recover(memcg);
}
+cleanup:
+ current->memcg_oom.memcg = NULL;
css_put(&memcg->css);
- current->memcg_oom.wait_on_memcg = NULL;
-out:
- current->memcg_oom.in_memcg_oom = 0;
return true;
}
@@ -2562,6 +2693,12 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
|| fatal_signal_pending(current)))
goto bypass;
+ if (unlikely(task_in_memcg_oom(current)))
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
@@ -2659,8 +2796,10 @@ done:
*ptr = memcg;
return 0;
nomem:
- *ptr = NULL;
- return -ENOMEM;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ *ptr = NULL;
+ return -ENOMEM;
+ }
bypass:
*ptr = root_mem_cgroup;
return -EINTR;
@@ -2709,15 +2848,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
- struct cgroup_subsys_state *css;
-
/* ID 0 is unused ID */
if (!id)
return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return mem_cgroup_from_css(css);
+ return mem_cgroup_from_id(id);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2812,7 +2946,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
unlock_page_cgroup(pc);
/*
- * "charge_statistics" updated event counter.
+ * "charge_statistics" updated event counter. Then, check it.
+ * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+ * if they exceeds softlimit.
*/
memcg_check_events(memcg, page);
}
@@ -2836,7 +2972,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
@@ -2865,21 +3001,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
- bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- /*
- * Conditions under which we can wait for the oom_killer. Those are
- * the same conditions tested by the core page allocator
- */
- may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, may_oom);
+ &_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -3019,7 +3148,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3280,7 +3409,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
- new_cachep = cachep->memcg_params->memcg_caches[idx];
+ new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3326,8 +3455,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- c = s->memcg_params->memcg_caches[i];
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3460,8 +3589,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (likely(cachep->memcg_params->memcg_caches[idx])) {
- cachep = cachep->memcg_params->memcg_caches[idx];
+ if (likely(cache_from_memcg_idx(cachep, idx))) {
+ cachep = cache_from_memcg_idx(cachep, idx);
goto out;
}
@@ -3663,8 +3792,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
{
/* Update stat data for mem_cgroup */
preempt_disable();
- WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
- __this_cpu_add(from->stat->count[idx], -nr_pages);
+ __this_cpu_sub(from->stat->count[idx], nr_pages);
__this_cpu_add(to->stat->count[idx], nr_pages);
preempt_enable();
}
@@ -4232,7 +4360,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, css_id(&memcg->css));
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
}
#endif
@@ -4284,8 +4412,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = css_id(&from->css);
- new_id = css_id(&to->css);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -4647,6 +4775,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+ unsigned long reclaimed;
+ int loop = 0;
+ struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
+ unsigned long nr_scanned;
+
+ if (order > 0)
+ return 0;
+
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+ /*
+ * This loop can run a while, specially if mem_cgroup's continuously
+ * keep exceeding their soft limit and putting the system under
+ * pressure
+ */
+ do {
+ if (next_mz)
+ mz = next_mz;
+ else
+ mz = mem_cgroup_largest_soft_limit_node(mctz);
+ if (!mz)
+ break;
+
+ nr_scanned = 0;
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+ gfp_mask, &nr_scanned);
+ nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
+ spin_lock(&mctz->lock);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup
+ * it is time to move on to the next cgroup
+ */
+ next_mz = NULL;
+ if (!reclaimed) {
+ do {
+ /*
+ * Loop until we find yet another one.
+ *
+ * By the time we get the soft_limit lock
+ * again, someone might have aded the
+ * group back on the RB tree. Iterate to
+ * make sure we get a different mem.
+ * mem_cgroup_largest_soft_limit_node returns
+ * NULL if no other cgroup is present on
+ * the tree
+ */
+ next_mz =
+ __mem_cgroup_largest_soft_limit_node(mctz);
+ if (next_mz == mz)
+ css_put(&next_mz->memcg->css);
+ else /* next_mz == NULL or other memcg */
+ break;
+ } while (1);
+ }
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->memcg->res);
+ /*
+ * One school of thought says that we should not add
+ * back the node to the tree if reclaim returns 0.
+ * But our reclaim could return 0, simply because due
+ * to priority we are exposing a smaller subset of
+ * memory to reclaim from. Consider this as a longer
+ * term TODO.
+ */
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ css_put(&mz->memcg->css);
+ loop++;
+ /*
+ * Could not reclaim anything and there are no more
+ * mem cgroups to try or we seem to be looping without
+ * reclaiming anything.
+ */
+ if (!nr_reclaimed &&
+ (next_mz == NULL ||
+ loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+ break;
+ } while (!nr_reclaimed);
+ if (next_mz)
+ css_put(&next_mz->memcg->css);
+ return nr_reclaimed;
+}
+
/**
* mem_cgroup_force_empty_list - clears LRU of a group
* @memcg: group to clear
@@ -4748,31 +4968,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
} while (usage > 0);
}
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
- * Since this is the very setting we are changing, the current hierarchy value
- * is meaningless
- */
-static inline bool __memcg_has_children(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *pos;
-
- /* bounce at first found */
- css_for_each_child(pos, &memcg->css)
- return true;
- return false;
-}
-
-/*
- * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
- * to be already dead (as in mem_cgroup_force_empty, for instance). This is
- * from mem_cgroup_count_children(), in the sense that we don't really care how
- * many children we have; we only need to know if we have any. It also counts
- * any memcg without hierarchy as infertile.
- */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
- return memcg->use_hierarchy && __memcg_has_children(memcg);
+ lockdep_assert_held(&memcg_create_mutex);
+ /*
+ * The lock does not prevent addition or deletion to the list
+ * of children, but it prevents a new child from being
+ * initialized based on this parent in css_online(), so it's
+ * enough to decide whether hierarchically inherited
+ * attributes can still be changed or not.
+ */
+ return memcg->use_hierarchy &&
+ !list_empty(&memcg->css.cgroup->children);
}
/*
@@ -4852,7 +5059,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (!__memcg_has_children(memcg))
+ if (list_empty(&memcg->css.cgroup->children))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -5179,45 +5386,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
{
+ struct numa_stat {
+ const char *name;
+ unsigned int lru_mask;
+ };
+
+ static const struct numa_stat stats[] = {
+ { "total", LRU_ALL },
+ { "file", LRU_ALL_FILE },
+ { "anon", LRU_ALL_ANON },
+ { "unevictable", BIT(LRU_UNEVICTABLE) },
+ };
+ const struct numa_stat *stat;
int nid;
- unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
- unsigned long node_nr;
+ unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
- seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
-
- file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
- seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_FILE);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+ seq_printf(m, "%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
- seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_ANON);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ struct mem_cgroup *iter;
+
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+ seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_node_nr_lru_pages(
+ iter, nid, stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- BIT(LRU_UNEVICTABLE));
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
@@ -5911,6 +6123,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
+ mz->usage_in_excess = 0;
+ mz->on_tree = false;
mz->memcg = memcg;
}
memcg->nodeinfo[node] = pn;
@@ -5966,7 +6180,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
int node;
size_t size = memcg_size();
- free_css_id(&mem_cgroup_subsys, &memcg->css);
+ mem_cgroup_remove_from_trees(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -6002,6 +6216,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+ struct mem_cgroup_tree_per_node *rtpn;
+ struct mem_cgroup_tree_per_zone *rtpz;
+ int tmp, node, zone;
+
+ for_each_node(node) {
+ tmp = node;
+ if (!node_state(node, N_NORMAL_MEMORY))
+ tmp = -1;
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+ BUG_ON(!rtpn);
+
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ }
+}
+
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -6031,7 +6268,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
- spin_lock_init(&memcg->soft_lock);
return &memcg->css;
@@ -6047,6 +6283,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
int error = 0;
+ if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
+
if (!parent)
return 0;
@@ -6109,13 +6348,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
- if (memcg->soft_contributed) {
- while ((memcg = parent_mem_cgroup(memcg)))
- atomic_dec(&memcg->children_in_excess);
-
- if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
- atomic_dec(&root_mem_cgroup->children_in_excess);
- }
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6123,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget()
+ * rcu_read_unlock()
+ * disable css_tryget()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
@@ -6325,7 +6593,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+ mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6376,10 +6644,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6568,9 +6836,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6587,7 +6855,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6745,7 +7013,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
- .use_id = 1,
};
#ifdef CONFIG_MEMCG_SWAP
@@ -6790,6 +7057,7 @@ static int __init mem_cgroup_init(void)
{
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
enable_swap_cgroup();
+ mem_cgroup_soft_limit_tree_init();
memcg_stock_init();
return 0;
}