Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 685
1 file changed, 315 insertions, 370 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b762215d73eb..2ed5f2a0879d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,14 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static void flush_memcg_stats_work(struct work_struct *w); -static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); -static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -242,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -459,28 +451,6 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_node * -mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) -{ - int nid = page_to_nid(page); - - return memcg->nodeinfo[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_node(int nid) -{ - return soft_limit_tree.rb_tree_per_node[nid]; -} - -static struct mem_cgroup_tree_per_node * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - - return soft_limit_tree.rb_tree_per_node[nid]; -} - static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) @@ -551,13 +521,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) return excess; } -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) { unsigned long excess; struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; /* @@ -565,7 +535,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_nodeinfo(memcg, page); + mz = memcg->nodeinfo[nid]; excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -596,7 +566,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = memcg->nodeinfo[nid]; - mctz = soft_limit_tree_node(nid); + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (mctz) mem_cgroup_remove_exceeded(mz, mctz); } @@ -638,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. 
Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +{ + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) + atomic_inc(&stats_flush_threshold); +} + +static void __mem_cgroup_flush_stats(void) +{ + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -650,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -678,12 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) - queue_work(system_unbound_wq, &stats_flush_work); + + memcg_rstat_updated(memcg); } /** @@ -754,24 +776,6 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -/* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. - */ -static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - struct pglist_data *pgdat, - enum node_stat_item idx, int nr) -{ - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); - rcu_read_unlock(); -} - /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -785,7 +789,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -804,7 +808,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, - struct page *page, int nr_pages) { /* pagein of a big page is an event. So, ignore page size */ @@ -847,7 +850,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, * Check events in order. 
* */ -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, int nid) { /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, @@ -858,7 +861,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) MEM_CGROUP_TARGET_SOFTLIMIT); mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); + mem_cgroup_update_tree(memcg, nid); } } @@ -1154,64 +1157,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); else - VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } #endif /** - * lock_page_lruvec - lock and return lruvec for a given page. - * @page: the page + * folio_lruvec_lock - Lock the lruvec for a folio. + * @folio: Pointer to the folio. * * These functions are safe to use under any of the following conditions: - * - page locked - * - PageLRU cleared - * - lock_page_memcg() - * - page->_refcount is zero + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held. */ -struct lruvec *lock_page_lruvec(struct page *page) +struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irq(struct page *page) +/** + * folio_lruvec_lock_irq - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. + */ +struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irq(&lruvec->lru_lock); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } -struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +/** + * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. + * @folio: Pointer to the folio. + * @flags: Pointer to irqsave flags. + * + * These functions are safe to use under any of the following conditions: + * - folio locked + * - folio_test_lru false + * - folio_memcg_lock() + * - folio frozen (refcount of 0) + * + * Return: The lruvec this folio is on with its lock held and interrupts + * disabled. 
+ */ +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, + unsigned long *flags) { - struct lruvec *lruvec; + struct lruvec *lruvec = folio_lruvec(folio); - lruvec = mem_cgroup_page_lruvec(page); spin_lock_irqsave(&lruvec->lru_lock, *flags); - - lruvec_memcg_debug(lruvec, page); + lruvec_memcg_debug(lruvec, folio); return lruvec; } @@ -1419,7 +1446,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -1580,7 +1607,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -1961,18 +1988,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page and memcg binding - * @page: the page + * folio_memcg_lock - Bind a folio to its memcg. + * @folio: The folio. * - * This function protects unlocked LRU pages from being moved to + * This function prevents unlocked LRU folios from being moved to * another cgroup. * - * It ensures lifetime of the locked memcg. Caller is responsible - * for the lifetime of the page. + * It ensures lifetime of the bound memcg. The caller is responsible + * for the lifetime of the folio. */ -void lock_page_memcg(struct page *page) +void folio_memcg_lock(struct folio *folio) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1986,7 +2012,7 @@ void lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return; again: - memcg = page_memcg(head); + memcg = folio_memcg(folio); if (unlikely(!memcg)) return; @@ -2000,7 +2026,7 @@ again: return; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page_memcg(head)) { + if (memcg != folio_memcg(folio)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2014,9 +2040,13 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; } -EXPORT_SYMBOL(lock_page_memcg); -static void __unlock_page_memcg(struct mem_cgroup *memcg) +void lock_page_memcg(struct page *page) +{ + folio_memcg_lock(page_folio(page)); +} + +static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -2031,16 +2061,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page and memcg binding - * @page: the page + * folio_memcg_unlock - Release the binding between a folio and its memcg. + * @folio: The folio. + * + * This releases the binding created by folio_memcg_lock(). This does + * not change the accounting of this folio to its memcg, but it does + * permit others to change it. 
*/ -void unlock_page_memcg(struct page *page) +void folio_memcg_unlock(struct folio *folio) { - struct page *head = compound_head(page); + __folio_memcg_unlock(folio_memcg(folio)); +} - __unlock_page_memcg(page_memcg(head)); +void unlock_page_memcg(struct page *page) +{ + folio_memcg_unlock(page_folio(page)); } -EXPORT_SYMBOL(unlock_page_memcg); struct obj_stock { #ifdef CONFIG_MEMCG_KMEM @@ -2083,41 +2119,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } #endif -/* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. User context code can - * access interrupt object stock, but not vice versa. - */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. @@ -2535,6 +2536,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2570,15 +2572,6 @@ retry: goto force; /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. 
This might exceed the limits temporarily, * but we prefer facilitating memory reclaim and getting back @@ -2635,8 +2628,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2645,14 +2639,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2727,8 +2717,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) -static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2737,11 +2726,10 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); } -#endif -static void commit_charge(struct page *page, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page_memcg(page), page); + VM_BUG_ON_FOLIO(folio_memcg(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2750,7 +2738,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)memcg; } static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) @@ -2775,6 +2763,59 @@ retry: */ #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +/* + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable + * sequence used in this case to access content from object stock is slow. + * To optimize for user context access, there are now two object stocks for + * task context and interrupt context access respectively. + * + * The task context object stock can be accessed by disabling preemption only + * which is cheap in non-preempt kernel. The interrupt context object stock + * can only be accessed after disabling interrupt. User context code can + * access interrupt object stock, but not vice versa. + */ +static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +{ + struct memcg_stock_pcp *stock; + + if (likely(in_task())) { + *pflags = 0UL; + preempt_disable(); + stock = this_cpu_ptr(&memcg_stock); + return &stock->task_obj; + } + + local_irq_save(*pflags); + stock = this_cpu_ptr(&memcg_stock); + return &stock->irq_obj; +} + +static inline void put_obj_stock(unsigned long flags) +{ + if (likely(in_task())) + preempt_enable(); + else + local_irq_restore(flags); +} + +/* + * mod_objcg_mlstate() may be called with irq enabled, so + * mod_memcg_lruvec_state() should be used. 
+ */ +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + rcu_read_unlock(); +} + int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) { @@ -2956,7 +2997,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { - struct page_counter *counter; struct mem_cgroup *memcg; int ret; @@ -2966,21 +3006,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && - !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - - /* - * Enforce __GFP_NOFAIL allocation because callers are not - * prepared to see failures and likely do not have any failure - * handling code. - */ - if (gfp & __GFP_NOFAIL) { - page_counter_charge(&memcg->kmem, nr_pages); - goto out; - } - cancel_charge(memcg, nr_pages); - ret = -ENOMEM; - } + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_charge(&memcg->kmem, nr_pages); out: css_put(&memcg->css); @@ -3020,15 +3047,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { + struct folio *folio = page_folio(page); struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!PageMemcgKmem(page)) + if (!folio_memcg_kmem(folio)) return; - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); obj_cgroup_uncharge_pages(objcg, nr_pages); - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } @@ -3262,17 +3290,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void split_page_memcg(struct page *head, unsigned int nr) { - struct mem_cgroup *memcg = page_memcg(head); + struct folio *folio = page_folio(head); + struct mem_cgroup *memcg = folio_memcg(folio); int i; if (mem_cgroup_disabled() || !memcg) return; for (i = 1; i < nr; i++) - head[i].memcg_data = head->memcg_data; + folio_page(folio, i)->memcg_data = folio->memcg_data; - if (PageMemcgKmem(head)) - obj_cgroup_get_many(__page_objcg(head), nr - 1); + if (folio_memcg_kmem(folio)) + obj_cgroup_get_many(__folio_objcg(folio), nr - 1); else css_get_many(&memcg->css, nr - 1); } @@ -3386,7 +3415,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node(pgdat->node_id); + mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; /* * Do not even bother to check the largest node if the root @@ -3470,19 +3499,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { - int progress; - if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, true); - if (!progress) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) nr_retries--; - /* maybe some writeback is necessary */ - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - } return 0; @@ -3523,8 +3544,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe 
context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3599,7 +3619,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3616,22 +3635,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg_id; - memcg->kmem_state = KMEM_ONLINE; return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; + struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmemcg_id == -1) return; - memcg->kmem_state = KMEM_ALLOCATED; - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; @@ -3642,31 +3657,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) BUG_ON(kmemcg_id < 0); /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ - rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - } - rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - /* css_alloc() failed, offlining didn't happen */ - if (unlikely(memcg->kmem_state == KMEM_ONLINE)) - memcg_offline_kmem(memcg); + memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -3676,22 +3675,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static int memcg_update_kmem_max(struct mem_cgroup *memcg, - unsigned long max) -{ - int ret; - - mutex_lock(&memcg_max_mutex); - ret = page_counter_set_max(&memcg->kmem, max); - mutex_unlock(&memcg_max_mutex); - return ret; -} - static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; @@ -3757,10 +3742,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); - ret = memcg_update_kmem_max(memcg, nr_pages); + /* kmem.limit_in_bytes is deprecated. 
*/ + ret = -EOPNOTSUPP; break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); @@ -3905,7 +3888,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -3977,7 +3960,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4480,7 +4463,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -4542,17 +4525,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * As being wrong occasionally doesn't matter, updates and accesses to the * records are lockless and racy. */ -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; int oldest = -1; int i; - trace_track_foreign_dirty(page, wb); + trace_track_foreign_dirty(folio, wb); /* * Pick the slot to use. If there is already a slot for @wb, keep @@ -5313,7 +5296,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - memcg_free_kmem(memcg); + + /* Need to offline kmem if online_css() fails */ + memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } @@ -5346,26 +5331,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - -static void flush_memcg_stats_work(struct work_struct *w) -{ - mem_cgroup_flush_stats(); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5555,7 +5520,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, #endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + unsigned long addr, pte_t ptent) { if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -5585,38 +5550,39 @@ static int mem_cgroup_move_account(struct page *page, struct mem_cgroup *from, struct mem_cgroup *to) { + struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; - int ret; + unsigned int nr_pages = compound ? 
folio_nr_pages(folio) : 1; + int nid, ret; VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON(compound && !PageTransHuge(page)); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON(compound && !folio_test_large(folio)); /* * Prevent mem_cgroup_migrate() from looking at * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; ret = -EINVAL; - if (page_memcg(page) != from) + if (folio_memcg(folio) != from) goto out_unlock; - pgdat = page_pgdat(page); + pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - lock_page_memcg(page); + folio_memcg_lock(folio); - if (PageAnon(page)) { - if (page_mapped(page)) { + if (folio_test_anon(folio)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (PageTransHuge(page)) { + if (folio_test_transhuge(folio)) { __mod_lruvec_state(from_vec, NR_ANON_THPS, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_THPS, @@ -5627,18 +5593,18 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - if (PageSwapBacked(page)) { + if (folio_test_swapbacked(folio)) { __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); } - if (page_mapped(page)) { + if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); } - if (PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + if (folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { __mod_lruvec_state(from_vec, NR_FILE_DIRTY, @@ -5649,7 +5615,7 @@ static int mem_cgroup_move_account(struct page *page, } } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); } @@ -5672,20 +5638,21 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->memcg_data = (unsigned long)to; + folio->memcg_data = (unsigned long)to; - __unlock_page_memcg(from); + __folio_memcg_unlock(from); ret = 0; + nid = folio_nid(folio); local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); + mem_cgroup_charge_statistics(to, nr_pages); + memcg_check_events(to, nid); + mem_cgroup_charge_statistics(from, -nr_pages); + memcg_check_events(from, nid); local_irq_enable(); out_unlock: - unlock_page(page); + folio_unlock(folio); out: return ret; } @@ -5728,7 +5695,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent, &ent); + page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; @@ -6383,7 +6350,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -6690,9 +6657,10 @@ void 
mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) +static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, + gfp_t gfp) { - unsigned int nr_pages = thp_nr_pages(page); + long nr_pages = folio_nr_pages(folio); int ret; ret = try_charge(memcg, gfp, nr_pages); @@ -6700,38 +6668,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) goto out; css_get(&memcg->css); - commit_charge(page, memcg); + commit_charge(folio, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(folio)); local_irq_enable(); out: return ret; } -/** - * __mem_cgroup_charge - charge a newly allocated page to a cgroup - * @page: page to charge - * @mm: mm context of the victim - * @gfp_mask: reclaim mode - * - * Try to charge @page to the memcg that @mm belongs to, reclaiming - * pages according to @gfp_mask if necessary. if @mm is NULL, try to - * charge to the active memcg. - * - * Do not use this for pages allocated for swapin. - * - * Returns 0 on success. Otherwise, an error code is returned. - */ -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = charge_memcg(page, memcg, gfp_mask); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6752,6 +6705,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { + struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; @@ -6766,7 +6720,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = charge_memcg(page, memcg, gfp); + ret = charge_memcg(folio, memcg, gfp); css_put(&memcg->css); return ret; @@ -6810,7 +6764,7 @@ struct uncharge_gather { unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; - struct page *dummy_page; + int nid; }; static inline void uncharge_gather_clear(struct uncharge_gather *ug) @@ -6834,36 +6788,36 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg_check_events(ug->memcg, ug->dummy_page); + memcg_check_events(ug->memcg, ug->nid); local_irq_restore(flags); - /* drop reference from uncharge_page */ + /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); } -static void uncharge_page(struct page *page, struct uncharge_gather *ug) +static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { - unsigned long nr_pages; + long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = PageMemcgKmem(page); + bool use_objcg = folio_memcg_kmem(folio); - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * page memcg or objcg at this point, we have fully - * exclusive access to the page. 
+ * folio memcg or objcg at this point, we have fully + * exclusive access to the folio. */ if (use_objcg) { - objcg = __page_objcg(page); + objcg = __folio_objcg(folio); /* * This get matches the put at the end of the function and * kmem pages do not hold memcg references anymore. */ memcg = get_mem_cgroup_from_objcg(objcg); } else { - memcg = __page_memcg(page); + memcg = __folio_memcg(folio); } if (!memcg) @@ -6875,19 +6829,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = memcg; - ug->dummy_page = page; + ug->nid = folio_nid(folio); /* pairs with css_put in uncharge_batch */ css_get(&memcg->css); } - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (use_objcg) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - page->memcg_data = 0; + folio->memcg_data = 0; obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ @@ -6895,28 +6849,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; - page->memcg_data = 0; + folio->memcg_data = 0; } css_put(&memcg->css); } -/** - * __mem_cgroup_uncharge - uncharge a page - * @page: page to uncharge - * - * Uncharge a page previously charged with __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio) { struct uncharge_gather ug; - /* Don't touch page->lru of any random page, pre-check: */ - if (!page_memcg(page)) + /* Don't touch folio->lru of any random page, pre-check: */ + if (!folio_memcg(folio)) return; uncharge_gather_clear(&ug); - uncharge_page(page, &ug); + uncharge_folio(folio, &ug); uncharge_batch(&ug); } @@ -6930,52 +6878,49 @@ void __mem_cgroup_uncharge(struct page *page) void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; - struct page *page; + struct folio *folio; uncharge_gather_clear(&ug); - list_for_each_entry(page, page_list, lru) - uncharge_page(page, &ug); + list_for_each_entry(folio, page_list, lru) + uncharge_folio(folio, &ug); if (ug.memcg) uncharge_batch(&ug); } /** - * mem_cgroup_migrate - charge a page's replacement - * @oldpage: currently circulating page - * @newpage: replacement page + * mem_cgroup_migrate - Charge a folio's replacement. + * @old: Currently circulating folio. + * @new: Replacement folio. * - * Charge @newpage as a replacement page for @oldpage. @oldpage will + * Charge @new as a replacement folio for @old. @old will * be uncharged upon free. * - * Both pages must be locked, @newpage->mapping must be set up. + * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; - unsigned int nr_pages; + long nr_pages = folio_nr_pages(new); unsigned long flags; - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); - VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), - newpage); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); if (mem_cgroup_disabled()) return; - /* Page cache replacement: new page already charged? 
*/ - if (page_memcg(newpage)) + /* Page cache replacement: new folio already charged? */ + if (folio_memcg(new)) return; - memcg = page_memcg(oldpage); - VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); if (!memcg) return; /* Force-charge the new page. The old one will be freed soon */ - nr_pages = thp_nr_pages(newpage); - if (!mem_cgroup_is_root(memcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) @@ -6983,11 +6928,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) } css_get(&memcg->css); - commit_charge(newpage, memcg); + commit_charge(new, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, nr_pages); - memcg_check_events(memcg, newpage); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(new)); local_irq_restore(flags); } @@ -7214,8 +7159,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -nr_entries); - memcg_check_events(memcg, page); + mem_cgroup_charge_statistics(memcg, -nr_entries); + memcg_check_events(memcg, page_to_nid(page)); css_put(&memcg->css); } |
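For reference, the flushing scheme this patch introduces (a per-CPU update counter feeding a shared threshold, a trylock-guarded flush, and a 2-second deferrable worker) can be sketched in plain userspace C. All names below are hypothetical; C11 atomics, a thread-local counter and a pthread mutex stand in for the kernel's per-CPU counters, stats_flush_lock and cgroup_rstat_flush_irqsafe():

/* Minimal sketch of the batched stats-flush pattern, assuming BATCH
 * plays the role of MEMCG_CHARGE_BATCH and flush() the role of
 * cgroup_rstat_flush_irqsafe(). Userspace analogue only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define BATCH 64	/* MEMCG_CHARGE_BATCH stand-in */
#define NCPUS 4		/* num_online_cpus() stand-in */

static _Thread_local unsigned int stats_updates;	/* per-CPU counter */
static atomic_int stats_flush_threshold;
static pthread_mutex_t stats_flush_lock = PTHREAD_MUTEX_INITIALIZER;

static void flush(void)
{
	puts("flushing stats");
}

/* Like memcg_rstat_updated(): bump the shared threshold once per
 * BATCH local update events. */
static void stat_updated(void)
{
	if (!(++stats_updates % BATCH))
		atomic_fetch_add(&stats_flush_threshold, 1);
}

/* Like __mem_cgroup_flush_stats(): one flusher at a time, others skip. */
static void do_flush(void)
{
	if (pthread_mutex_trylock(&stats_flush_lock))
		return;
	flush();
	atomic_store(&stats_flush_threshold, 0);
	pthread_mutex_unlock(&stats_flush_lock);
}

/* Like mem_cgroup_flush_stats(): readers flush only once enough
 * update batches have accumulated. */
static void maybe_flush(void)
{
	for (int i = 0; i < 1; i++)	/* single check per call */
		if (atomic_load(&stats_flush_threshold) > NCPUS)
			do_flush();
}

int main(void)
{
	for (int i = 0; i < 100000; i++) {
		stat_updated();
		maybe_flush();
	}
	return 0;
}

The point of the threshold is that a reader pays for a flush only after roughly BATCH * NCPUS update events, while the periodic worker bounds staleness to about two seconds.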
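The reworked charge path drops the up-front force-charge for dying tasks and instead remembers whether the memcg OOM killer has already run. A hedged sketch of that retry policy, with reclaim_made_room() and oom_kill() as hypothetical stand-ins for try_to_free_mem_cgroup_pages() and mem_cgroup_oom(), and a trivial task_is_dying() stub:

/* Sketch of the passed_oom retry logic in try_charge_memcg(); the
 * stubs below are invented solely to make the example runnable. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 5

enum oom_status { OOM_SUCCESS, OOM_FAILED };

static int attempts;
static bool reclaim_made_room(void) { return ++attempts > 12; }
static bool task_is_dying(void) { return false; }
static enum oom_status oom_kill(void) { return OOM_SUCCESS; }

static int charge(void)
{
	int nr_retries = MAX_RECLAIM_RETRIES;
	bool passed_oom = false;

retry:
	if (reclaim_made_room())
		return 0;

	if (nr_retries--)
		goto retry;

	/* Avoid an endless loop for tasks bypassed by the OOM killer:
	 * once the killer has run, a dying task fails the charge
	 * instead of retrying forever. */
	if (passed_oom && task_is_dying())
		return -ENOMEM;

	if (oom_kill() == OOM_SUCCESS) {
		passed_oom = true;
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	}
	return -ENOMEM;	/* OOM_FAILED now falls through to nomem */
}

int main(void)
{
	printf("charge() -> %d\n", charge());
	return 0;
}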
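On the uncharge side, consecutive folios belonging to the same memcg are batched into one settlement (struct uncharge_gather / uncharge_batch()). A minimal sketch of that accumulate-then-flush pattern, with integer owner ids standing in for memcg pointers and settle() for uncharge_batch():

/* Hypothetical analogue of uncharge_folio()/uncharge_batch():
 * consecutive items with the same owner are accumulated and
 * settled with a single update. */
#include <stdio.h>

struct gather {
	int owner;	/* uncharge_gather::memcg stand-in */
	long nr_pages;	/* uncharge_gather::nr_memory stand-in */
};

static void settle(struct gather *ug)
{
	if (ug->nr_pages)
		printf("uncharge %ld pages from owner %d\n",
		       ug->nr_pages, ug->owner);
	ug->nr_pages = 0;
}

static void gather_one(struct gather *ug, int owner, long nr_pages)
{
	/* New owner: settle the current batch, then start a new one. */
	if (ug->owner != owner) {
		settle(ug);
		ug->owner = owner;
	}
	ug->nr_pages += nr_pages;
}

int main(void)
{
	struct gather ug = { .owner = -1 };
	int owners[] = { 1, 1, 1, 2, 2, 1 };

	for (unsigned int i = 0; i < sizeof(owners) / sizeof(*owners); i++)
		gather_one(&ug, owners[i], 1);
	settle(&ug);	/* final batch, as __mem_cgroup_uncharge_list() does */
	return 0;
}

Batching this way means releasing N folios charged to the same memcg costs one counter update rather than N.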