author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 19:29:45 -0800
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 19:29:45 -0800
commit | e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770 (patch) |
tree | f7ed7753a2e66486a4ffe0fbbf98404ec4ba2212 /mm/vmscan.c |
parent | 7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (diff) |
parent | c45bc55a99957b20e4e0333bcd42e12d1833a7f5 (diff) |
Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- More userfaultfd work from Peter Xu
- Several convert-to-folios series from Sidhartha Kumar and Huang Ying
- Some filemap cleanups from Vishal Moola
- David Hildenbrand added the ability to selftest anon memory COW
handling
- Some cpuset simplifications from Liu Shixin
- Addition of vmalloc tracing support by Uladzislau Rezki
- Some pagecache folioifications and simplifications from Matthew
Wilcox
- A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use
it
- Miguel Ojeda contributed some cleanups for our use of the
__no_sanitize_thread__ gcc keyword.
This series should have been in the non-MM tree, my bad
- Naoya Horiguchi improved the interaction between memory poisoning and
memory section removal for huge pages
- DAMON cleanups and tuneups from SeongJae Park
- Tony Luck fixed the handling of COW faults against poisoned pages
- Peter Xu utilized the PTE marker code for handling swapin errors
- Hugh Dickins reworked compound page mapcount handling, simplifying it
and making it more efficient
- Removal of the autonuma savedwrite infrastructure from Nadav Amit and
David Hildenbrand
- zram support for multiple compression streams from Sergey Senozhatsky
- David Hildenbrand reworked the GUP code's R/O long-term pinning so
that drivers no longer need to use the FOLL_FORCE workaround which
didn't work very well anyway
- Mel Gorman altered the page allocator so that local IRQs can remain
enabled during per-cpu page allocations
- Vishal Moola removed the try_to_release_page() wrapper
- Stefan Roesch added some per-BDI sysfs tunables which are used to
  prevent network block devices from dirtying excessive amounts of
  pagecache (a sketch of these knobs follows this list)
- David Hildenbrand did some cleanup and repair work on KSM COW
breaking
- Nhat Pham and Johannes Weiner have implemented writeback in zswap's
zsmalloc backend
- Brian Foster has fixed a longstanding corner-case oddity in
file[map]_write_and_wait_range()
- sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang
Chen
- Shiyang Ruan has done some work on fsdax, to make its reflink mode
work better under xfstests. Better, but still not perfect
- Christoph Hellwig has removed the .writepage() method from several
filesystems. They only need .writepages()
- Yosry Ahmed wrote a series which fixes the memcg reclaim target
beancounting
- David Hildenbrand has fixed some of our MM selftests for 32-bit
machines
- Many singleton patches, as usual
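
The per-BDI tunables from Stefan Roesch's series live under /sys/class/bdi/. Below is a minimal C sketch of how a setup tool might cap a network block device's dirty pagecache. The knob names (max_bytes, strict_limit) reflect my reading of the series and the MAJOR:MINOR device ID is a placeholder, so treat this as an assumption-laden illustration rather than a reference.

```c
/* Hedged sketch: capping dirty pagecache for one block device via the
 * per-BDI sysfs knobs. Knob names are assumed from Stefan Roesch's
 * series; "259:0" is a placeholder MAJOR:MINOR for the target device. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_knob(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* limit this device's dirty pagecache to 256 MiB ... */
	write_knob("/sys/class/bdi/259:0/max_bytes", "268435456");
	/* ... and enforce the limit strictly rather than as a hint */
	write_knob("/sys/class/bdi/259:0/strict_limit", "1");
	return 0;
}
```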
* tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits)
mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio
mm: mmu_gather: allow more than one batch of delayed rmaps
mm: fix typo in struct pglist_data code comment
kmsan: fix memcpy tests
mm: add cond_resched() in swapin_walk_pmd_entry()
mm: do not show fs mm pc for VM_LOCKONFAULT pages
selftests/vm: ksm_functional_tests: fixes for 32bit
selftests/vm: cow: fix compile warning on 32bit
selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions
mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem
mm,thp,rmap: fix races between updates of subpages_mapcount
mm: memcg: fix swapcached stat accounting
mm: add nodes= arg to memory.reclaim
mm: disable top-tier fallback to reclaim on proactive reclaim
selftests: cgroup: make sure reclaim target memcg is unprotected
selftests: cgroup: refactor proactive reclaim code to reclaim_until()
mm: memcg: fix stale protection of reclaim target memcg
mm/mmap: properly unaccount memory on mas_preallocate() failure
omfs: remove ->writepage
jfs: remove ->writepage
...
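
One of the commits above, "mm: add nodes= arg to memory.reclaim", lets proactive reclaim be restricted to specific NUMA nodes; it is what the nodemask plumbing through try_to_free_mem_cgroup_pages() in the diff below serves. A minimal userspace sketch of the interface, assuming a hypothetical cgroup path; the "<size> nodes=<list>" syntax follows Documentation/admin-guide/cgroup-v2.rst:

```c
/* Sketch: trigger proactive reclaim of up to 1 GiB from node 0 only.
 * The cgroup path is a placeholder; adjust to your hierarchy. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/example/memory.reclaim";
	const char *req = "1G nodes=0";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* the write fails (EAGAIN) if the kernel could not reclaim
	 * the full requested amount */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}
```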
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 125
1 file changed, 79 insertions, 46 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8fcc5fa768c0..aba991c505f1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/khugepaged.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1020,31 +1021,52 @@ out:
 	return freed;
 }
 
-static void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
 {
-	unsigned long freed;
-	int shift = 0;
+	unsigned long freed = 0;
+	struct mem_cgroup *memcg = NULL;
 
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		struct mem_cgroup *memcg = NULL;
+		freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 
-		if (fatal_signal_pending(current))
-			return;
+	return freed;
+}
 
+void drop_slab(void)
+{
+	int nid;
+	int shift = 0;
+	unsigned long freed;
+
+	do {
 		freed = 0;
-		memcg = mem_cgroup_iter(NULL, NULL, NULL);
-		do {
-			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
-		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+		for_each_online_node(nid) {
+			if (fatal_signal_pending(current))
+				return;
+
+			freed += drop_slab_node(nid);
+		}
 	} while ((freed >> shift++) > 1);
 }
 
-void drop_slab(void)
+static int reclaimer_offset(void)
 {
-	int nid;
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+			PGSCAN_DIRECT - PGSCAN_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
 
-	for_each_online_node(nid)
-		drop_slab_node(nid);
+	if (current_is_kswapd())
+		return 0;
+	if (current_is_khugepaged())
+		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }
 
 static inline int is_page_cache_freeable(struct folio *folio)
@@ -1346,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
 	if (folio_test_swapcache(folio)) {
 		swp_entry_t swap = folio_swap_entry(folio);
 
-		/* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(folio, target_memcg);
-		mem_cgroup_swapout(folio, swap);
 		__delete_from_swap_cache(folio, swap, shadow);
+		mem_cgroup_swapout(folio, swap);
 		xa_unlock_irq(&mapping->i_pages);
 		put_swap_folio(folio, swap);
 	} else {
@@ -1599,10 +1620,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 				      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
 				      &nr_succeeded);
 
-	if (current_is_kswapd())
-		__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
-	else
-		__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+	__count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
 
 	return nr_succeeded;
 }
@@ -2069,10 +2087,29 @@ keep:
 	nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
 	/* Folios that could not be demoted are still in @demote_folios */
 	if (!list_empty(&demote_folios)) {
-		/* Folios which weren't demoted go back on @folio_list for retry: */
+		/* Folios which weren't demoted go back on @folio_list */
 		list_splice_init(&demote_folios, folio_list);
-		do_demote_pass = false;
-		goto retry;
+
+		/*
+		 * goto retry to reclaim the undemoted folios in folio_list if
+		 * desired.
+		 *
+		 * Reclaiming directly from top tier nodes is not often desired
+		 * due to it breaking the LRU ordering: in general memory
+		 * should be reclaimed from lower tier nodes and demoted from
+		 * top tier nodes.
+		 *
+		 * However, disabling reclaim from top tier nodes entirely
+		 * would cause ooms in edge scenarios where lower tier memory
+		 * is unreclaimable for whatever reason, eg memory being
+		 * mlocked or too hot to reclaim. We can disable reclaim
+		 * from top tier nodes in proactive reclaim though as that is
+		 * not real memory pressure.
+		 */
+		if (!sc->proactive) {
+			do_demote_pass = false;
+			goto retry;
+		}
 	}
 
 	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -2475,7 +2512,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     &nr_scanned, sc, lru);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+	item = PGSCAN_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2492,14 +2529,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	move_folios_to_lru(lruvec, &folio_list);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+	item = PGSTEAL_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
 	spin_unlock_irq(&lruvec->lru_lock);
 
-	lru_note_cost(lruvec, file, stat.nr_pageout);
+	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
 	mem_cgroup_uncharge_list(&folio_list);
 	free_unref_page_list(&folio_list);
@@ -2651,6 +2688,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&lruvec->lru_lock);
 
+	if (nr_rotated)
+		lru_note_cost(lruvec, file, 0, nr_rotated);
 	mem_cgroup_uncharge_list(&l_active);
 	free_unref_page_list(&l_active);
 	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
@@ -3145,7 +3184,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 	if (memcg) {
 		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
 
-		/* for hotadd_new_pgdat() */
+		/* see the comment in mem_cgroup_lruvec() */
 		if (!lruvec->pgdat)
 			lruvec->pgdat = pgdat;
 
@@ -3154,7 +3193,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 #endif
 	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
 
-	return pgdat ? &pgdat->__lruvec : NULL;
+	return &pgdat->__lruvec;
 }
 
 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
@@ -3218,9 +3257,6 @@ void lru_gen_add_mm(struct mm_struct *mm)
 	for_each_node_state(nid, N_MEMORY) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-		if (!lruvec)
-			continue;
-
 		/* the first addition since the last iteration */
 		if (lruvec->mm_state.tail == &mm_list->fifo)
 			lruvec->mm_state.tail = &mm->lru_gen.list;
@@ -3250,9 +3286,6 @@ void lru_gen_del_mm(struct mm_struct *mm)
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-		if (!lruvec)
-			continue;
-
 		/* where the last iteration ended (exclusive) */
 		if (lruvec->mm_state.tail == &mm->lru_gen.list)
 			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
@@ -4498,7 +4531,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
 
 	mem_cgroup_calculate_protection(NULL, memcg);
 
-	if (mem_cgroup_below_min(memcg))
+	if (mem_cgroup_below_min(NULL, memcg))
 		return false;
 
 	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
@@ -4869,7 +4902,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 			break;
 	}
 
-	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+	item = PGSCAN_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc)) {
 		__count_vm_events(item, isolated);
 		__count_vm_events(PGREFILL, sorted);
@@ -5047,7 +5080,7 @@ retry:
 	if (walk && walk->batched)
 		reset_batch_size(lruvec, walk);
 
-	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+	item = PGSTEAL_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, reclaimed);
 	__count_memcg_events(memcg, item, reclaimed);
@@ -5085,8 +5118,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	DEFINE_MAX_SEQ(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	if (mem_cgroup_below_min(memcg) ||
-	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
+	    (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
+	     !sc->memcg_low_reclaim))
 		return 0;
 
 	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
@@ -5327,9 +5361,6 @@ static void lru_gen_change_state(bool enabled)
 		for_each_node(nid) {
 			struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-			if (!lruvec)
-				continue;
-
 			spin_lock_irq(&lruvec->lru_lock);
 
 			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -5395,7 +5426,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
-	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+	return sysfs_emit(buf, "0x%04x\n", caps);
 }
 
 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -6084,13 +6115,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 
 		mem_cgroup_calculate_protection(target_memcg, memcg);
 
-		if (mem_cgroup_below_min(memcg)) {
+		if (mem_cgroup_below_min(target_memcg, memcg)) {
 			/*
 			 * Hard protection.
 			 * If there is no reclaimable memory, OOM.
 			 */
 			continue;
-		} else if (mem_cgroup_below_low(memcg)) {
+		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
 			/*
 			 * Soft protection.
 			 * Respect the protection only as long as
@@ -6726,7 +6757,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   unsigned int reclaim_options)
+					   unsigned int reclaim_options,
+					   nodemask_t *nodemask)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -6741,6 +6773,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+		.nodemask = nodemask,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
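
The reclaimer_offset() helper added above relies on the PGSTEAL_*, PGSCAN_* and PGDEMOTE_* counter families sharing one layout, so a single offset picks the right counter for kswapd, direct, or khugepaged reclaim in every family. Here is a self-contained userspace sketch of the same pattern; the toy enums and the `who` parameter stand in for the kernel's vm_event_item values and current_is_kswapd()/current_is_khugepaged() checks, so this is an illustration of the technique, not kernel code.

```c
/* Userspace sketch of the reclaimer_offset() pattern: parallel counter
 * enums share a layout, so one offset indexes the right slot in every
 * family. Enum values and the reclaimer check are illustrative. */
#include <assert.h>
#include <stdio.h>

enum vm_event {
	PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED,
	PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED,
	NR_VM_EVENTS
};

enum reclaimer { KSWAPD, DIRECT, KHUGEPAGED };

static unsigned long events[NR_VM_EVENTS];

/* Mirrors the kernel's BUILD_BUG_ON checks: each family must keep its
 * entries at the same distance from its *_KSWAPD base. */
static_assert(PGSTEAL_DIRECT - PGSTEAL_KSWAPD ==
	      PGSCAN_DIRECT - PGSCAN_KSWAPD, "counter layouts must match");
static_assert(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD ==
	      PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD, "counter layouts must match");

static int reclaimer_offset(enum reclaimer who)
{
	if (who == KSWAPD)
		return 0;
	if (who == KHUGEPAGED)
		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}

int main(void)
{
	/* one offset selects the right slot in both families */
	events[PGSCAN_KSWAPD + reclaimer_offset(DIRECT)] += 32;  /* PGSCAN_DIRECT */
	events[PGSTEAL_KSWAPD + reclaimer_offset(DIRECT)] += 16; /* PGSTEAL_DIRECT */
	printf("pgscan_direct=%lu pgsteal_direct=%lu\n",
	       events[PGSCAN_DIRECT], events[PGSTEAL_DIRECT]);
	return 0;
}
```

The static_asserts play the role of the kernel's BUILD_BUG_ON() checks: if one enum family is reordered without the others, the build fails instead of silently bumping the wrong counter.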