diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 377 |
1 files changed, 333 insertions, 44 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010f9166fa6e..bd9052a5d3ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; /* @@ -231,6 +233,11 @@ struct mem_cgroup { * reclaimed from. */ int last_scanned_child; + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + unsigned long next_scan_node_update; +#endif /* * Should the accounting and control be hierarchical, per subtree? */ @@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +} + +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +} + static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, enum mem_cgroup_events_index idx) { @@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } +static unsigned long +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) +{ + struct mem_cgroup_per_zone *mz; + u64 total = 0; + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + total += MEM_CGROUP_ZSTAT(mz, idx); + } + return total; +} static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { - int nid, zid; - struct mem_cgroup_per_zone *mz; + int nid; u64 total = 0; for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + total += mem_cgroup_get_zonestat_node(mem, nid, idx); return total; } @@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) return (mem == root_mem_cgroup); } +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +{ + struct mem_cgroup *mem; + + if (!mm) + return; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) + goto out; + + switch (idx) { + case PGMAJFAULT: + mem_cgroup_pgmajfault(mem, 1); + break; + case PGFAULT: + mem_cgroup_pgfault(mem, 1); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(mem_cgroup_count_vm_event); + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) return (active > inactive); } -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) +unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, + struct zone *zone, + enum lru_list lru) { int nid = zone_to_nid(zone); int zid = zone_idx(zone); @@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, return MEM_CGROUP_ZSTAT(mz, lru); } +#ifdef CONFIG_NUMA +static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); + + return ret; +} + +static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); + + return ret; +} + +static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); + + return total; +} + +static unsigned long +mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) +{ + return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); +} + +static unsigned long +mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + enum lru_list l; + u64 total = 0; + + for_each_lru(l) + total += mem_cgroup_get_zonestat_node(memcg, nid, l); + + return total; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(memcg, nid); + + return total; +} +#endif /* CONFIG_NUMA */ + struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) return ret; } +#if MAX_NUMNODES > 1 + +/* + * Always updating the nodemask is not very good - even if we have an empty + * list or the wrong list here, we can start from some node and traverse all + * nodes based on the zonelist. So update the list loosely once per 10 secs. + * + */ +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +{ + int nid; + + if (time_after(mem->next_scan_node_update, jiffies)) + return; + + mem->next_scan_node_update = jiffies + 10*HZ; + /* make a nodemask where this memcg uses memory from */ + mem->scan_nodes = node_states[N_HIGH_MEMORY]; + + for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + + if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) + continue; + + if (total_swap_pages && + (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) + continue; + node_clear(nid, mem->scan_nodes); + } +} + +/* + * Selecting a node where we start reclaim from. Because what we need is just + * reducing usage counter, start from anywhere is O,K. Considering + * memory reclaim from current node, there are pros. and cons. + * + * Freeing memory from current node means freeing memory from a node which + * we'll use or we've used. So, it may make LRU bad. And if several threads + * hit limits, it will see a contention on a node. But freeing from remote + * node means more costs for memory reclaim because of memory latency. + * + * Now, we use round-robin. Better algorithm is welcomed. + */ +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + int node; + + mem_cgroup_may_update_nodemask(mem); + node = mem->last_scanned_node; + + node = next_node(node, mem->scan_nodes); + if (node == MAX_NUMNODES) + node = first_node(mem->scan_nodes); + /* + * We call this when we hit limit, not when pages are added to LRU. + * No LRU may hold pages because all pages are UNEVICTABLE or + * memcg is too small and all pages are not on LRU. In that case, + * we use curret node. + */ + if (unlikely(node == MAX_NUMNODES)) + node = numa_node_id(); + + mem->last_scanned_node = node; + return node; +} + +#else +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + return 0; +} +#endif + /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, struct zone *zone, gfp_t gfp_mask, - unsigned long reclaim_options) + unsigned long reclaim_options, + unsigned long *total_scanned) { struct mem_cgroup *victim; int ret, total = 0; @@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, continue; } /* we use swappiness of local cgroup */ - if (check_soft) + if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone); - else + noswap, get_swappiness(victim), zone, + &nr_scanned); + *total_scanned += nr_scanned; + } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); css_put(&victim->css); @@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!res_counter_soft_limit_excess(&root_mem->res)) return total; } else if (mem_cgroup_margin(root_mem)) - return 1 + total; + return total; } return total; } @@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags, NULL); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, + unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, int loop = 0; struct mem_cgroup_tree_per_zone *mctz; unsigned long long excess; + unsigned long nr_scanned; if (order > 0) return 0; @@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (!mz) break; + nr_scanned = 0; reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask, - MEM_CGROUP_RECLAIM_SOFT); + MEM_CGROUP_RECLAIM_SOFT, + &nr_scanned); nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; spin_lock(&mctz->lock); /* @@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ next_mz = __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) { + if (next_mz == mz) css_put(&next_mz->mem->css); - next_mz = NULL; - } else /* next_mz == NULL or other memcg */ + else /* next_mz == NULL or other memcg */ break; } while (1); } @@ -3772,6 +3997,8 @@ enum { MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, + MCS_PGFAULT, + MCS_PGMAJFAULT, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -3794,6 +4021,8 @@ struct { {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"swap", "total_swap"}, + {"pgfault", "total_pgfault"}, + {"pgmajfault", "total_pgmajfault"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + s->stat[MCS_PGFAULT] += val; + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) mem_cgroup_get_local_stat(iter, s); } +#ifdef CONFIG_NUMA +static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +{ + int nid; + unsigned long total_nr, file_nr, anon_nr, unevictable_nr; + unsigned long node_nr; + struct cgroup *cont = m->private; + struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + + total_nr = mem_cgroup_nr_lru_pages(mem_cont); + seq_printf(m, "total=%lu", total_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); + seq_printf(m, "file=%lu", file_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); + seq_printf(m, "anon=%lu", anon_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); + seq_printf(m, "unevictable=%lu", unevictable_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, + nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + return 0; +} +#endif /* CONFIG_NUMA */ + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { @@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) { if (i == MCS_SWAP && !do_swap_account) continue; @@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } +#ifdef CONFIG_NUMA +static const struct file_operations mem_control_numa_stat_file_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_control_numa_stat_open(struct inode *unused, struct file *file) +{ + struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; + + file->f_op = &mem_control_numa_stat_file_operations; + return single_open(file, mem_control_numa_stat_show, cont); +} +#endif /* CONFIG_NUMA */ + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .open = mem_control_numa_stat_open, + }, +#endif }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; + mem->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&mem->oom_notify); if (parent) @@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -5112,8 +5412,7 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { struct mm_struct *mm; @@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif @@ -5169,19 +5465,12 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!(*s) || !strcmp(s, "=1")) + if (!strcmp(s, "1")) really_do_swap_account = 1; - else if (!strcmp(s, "=0")) + else if (!strcmp(s, "0")) really_do_swap_account = 0; return 1; } -__setup("swapaccount", enable_swap_account); +__setup("swapaccount=", enable_swap_account); -static int __init disable_swap_account(char *s) -{ - printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); - enable_swap_account("=0"); - return 1; -} -__setup("noswapaccount", disable_swap_account); #endif |