From aa6f8b2593b56a02043684182a89853f919dff3e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 18 Oct 2024 15:34:11 -0700 Subject: mm/gup: stop leaking pinned pages in low memory conditions If a driver tries to call any of the pin_user_pages*(FOLL_LONGTERM) family of functions, and requests "too many" pages, then the call will erroneously leave pages pinned. This is visible in user space as an actual memory leak. Repro is trivial: just make enough pin_user_pages(FOLL_LONGTERM) calls to exhaust memory. The root cause of the problem is this sequence, within __gup_longterm_locked(): __get_user_pages_locked() rc = check_and_migrate_movable_pages() ...which gets retried in a loop. The loop error handling is incomplete, clearly due to a somewhat unusual and complicated tri-state error API. But anyway, if -ENOMEM, or in fact, any unexpected error is returned from check_and_migrate_movable_pages(), then __gup_longterm_locked() happily returns the error, while leaving the pages pinned. In the failed case, which is an app that requests (via a device driver) 30720000000 bytes to be pinned, and then exits, I see this: $ grep foll /proc/vmstat nr_foll_pin_acquired 7502048 nr_foll_pin_released 2048 And after applying this patch, it returns to balanced pins: $ grep foll /proc/vmstat nr_foll_pin_acquired 7502048 nr_foll_pin_released 7502048 Note that the child routine, check_and_migrate_movable_folios(), avoids this problem, by unpinning any folios in the **folios argument, before returning an error. Fix this by making check_and_migrate_movable_pages() behave in exactly the same way as check_and_migrate_movable_folios(): unpin all pages in **pages, before returning an error. Also, documentation was an aggravating factor, so: 1) Consolidate the documentation for these two routines, now that they have identical external behavior. 2) Rewrite the consolidated documentation: a) Clearly list the three return code cases, and what happens in each case. b) Mention that one of the cases unpins the pages or folios, before returning an error code. Link: https://lkml.kernel.org/r/20241018223411.310331-1-jhubbard@nvidia.com Fixes: 24a95998e9ba ("mm/gup.c: simplify and fix check_and_migrate_movable_pages() return codes") Signed-off-by: John Hubbard Reviewed-by: Alistair Popple Suggested-by: David Hildenbrand Cc: Shigeru Yoshida Cc: Jason Gunthorpe Cc: Minchan Kim Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/gup.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a82890b46a36..4637dab7b54f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2394,20 +2394,25 @@ err: } /* - * Check whether all folios are *allowed* to be pinned indefinitely (longterm). + * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * - * If any folios in the range are not allowed to be pinned, then this routine - * will migrate those folios away, unpin all the folios in the range and return - * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then - * call this routine again. + * Return values: * - * If an error other than -EAGAIN occurs, this indicates a migration failure. - * The caller should give up, and propagate the error back up the call stack. - * - * If everything is OK and all folios in the range are allowed to be pinned, + * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. + * + * -EAGAIN: if any folios in the range are not allowed to be pinned, then this + * routine will migrate those folios away, unpin all the folios in the range. If + * migration of the entire set of folios succeeds, then -EAGAIN is returned. The + * caller should re-pin the entire range with FOLL_PIN and then call this + * routine again. + * + * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this + * indicates a migration failure. The caller should give up, and propagate the + * error back up the call stack. The caller does not need to unpin any folios in + * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) @@ -2425,10 +2430,8 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios, } /* - * This routine just converts all the pages in the @pages array to folios and - * calls check_and_migrate_movable_folios() to do the heavy lifting. - * - * Please see the check_and_migrate_movable_folios() documentation for details. + * Return values and behavior are the same as those for + * check_and_migrate_movable_folios(). */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) @@ -2437,8 +2440,10 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, long i, ret; folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL); - if (!folios) + if (!folios) { + unpin_user_pages(pages, nr_pages); return -ENOMEM; + } for (i = 0; i < nr_pages; i++) folios[i] = page_folio(pages[i]); -- cgit v1.2.3-70-g09d2 From f4657e16e767105194f97586fe3c03d3f64c4d37 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Sun, 20 Oct 2024 15:08:19 +0800 Subject: mm/codetag: fix null pointer check logic for ref and tag When we compile and load lib/slub_kunit.c,it will cause a panic. The root cause is that __kmalloc_cache_noprof was directly called instead of kmem_cache_alloc,which resulted in no alloc_tag being allocated.This caused current->alloc_tag to be null,leading to a null pointer dereference in alloc_tag_ref_set. Despite the fact that my colleague Pei Xiao will later fix the code in slub_kunit.c,we still need fix null pointer check logic for ref and tag to avoid panic caused by a null pointer dereference. Here is the log for the panic: [ 74.779373][ T2158] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 [ 74.780130][ T2158] Mem abort info: [ 74.780406][ T2158] ESR = 0x0000000096000004 [ 74.780756][ T2158] EC = 0x25: DABT (current EL), IL = 32 bits [ 74.781225][ T2158] SET = 0, FnV = 0 [ 74.781529][ T2158] EA = 0, S1PTW = 0 [ 74.781836][ T2158] FSC = 0x04: level 0 translation fault [ 74.782288][ T2158] Data abort info: [ 74.782577][ T2158] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 74.783068][ T2158] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 74.783533][ T2158] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 74.784010][ T2158] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000105f34000 [ 74.784586][ T2158] [0000000000000020] pgd=0000000000000000, p4d=0000000000000000 [ 74.785293][ T2158] Internal error: Oops: 0000000096000004 [#1] SMP [ 74.785805][ T2158] Modules linked in: slub_kunit kunit ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle 4 [ 74.790661][ T2158] CPU: 0 UID: 0 PID: 2158 Comm: kunit_try_catch Kdump: loaded Tainted: G W N 6.12.0-rc3+ #2 [ 74.791535][ T2158] Tainted: [W]=WARN, [N]=TEST [ 74.791889][ T2158] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 74.792479][ T2158] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 74.793101][ T2158] pc : alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.793607][ T2158] lr : alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.794095][ T2158] sp : ffff800084d33cd0 [ 74.794418][ T2158] x29: ffff800084d33cd0 x28: 0000000000000000 x27: 0000000000000000 [ 74.795095][ T2158] x26: 0000000000000000 x25: 0000000000000012 x24: ffff80007b30e314 [ 74.795822][ T2158] x23: ffff000390ff6f10 x22: 0000000000000000 x21: 0000000000000088 [ 74.796555][ T2158] x20: ffff000390285840 x19: fffffd7fc3ef7830 x18: ffffffffffffffff [ 74.797283][ T2158] x17: ffff8000800e63b4 x16: ffff80007b33afc4 x15: ffff800081654c00 [ 74.798011][ T2158] x14: 0000000000000000 x13: 205d383531325420 x12: 5b5d383734363537 [ 74.798744][ T2158] x11: ffff800084d337e0 x10: 000000000000005d x9 : 00000000ffffffd0 [ 74.799476][ T2158] x8 : 7f7f7f7f7f7f7f7f x7 : ffff80008219d188 x6 : c0000000ffff7fff [ 74.800206][ T2158] x5 : ffff0003fdbc9208 x4 : ffff800081edd188 x3 : 0000000000000001 [ 74.800932][ T2158] x2 : 0beaa6dee1ac5a00 x1 : 0beaa6dee1ac5a00 x0 : ffff80037c2cb000 [ 74.801656][ T2158] Call trace: [ 74.801954][ T2158] alloc_tagging_slab_alloc_hook+0x120/0x270 [ 74.802494][ T2158] __kmalloc_cache_noprof+0x148/0x33c [ 74.802976][ T2158] test_kmalloc_redzone_access+0x4c/0x104 [slub_kunit] [ 74.803607][ T2158] kunit_try_run_case+0x70/0x17c [kunit] [ 74.804124][ T2158] kunit_generic_run_threadfn_adapter+0x2c/0x4c [kunit] [ 74.804768][ T2158] kthread+0x10c/0x118 [ 74.805141][ T2158] ret_from_fork+0x10/0x20 [ 74.805540][ T2158] Code: b9400a80 11000400 b9000a80 97ffd858 (f94012d3) [ 74.806176][ T2158] SMP: stopping secondary CPUs [ 74.808130][ T2158] Starting crashdump kernel... Link: https://lkml.kernel.org/r/20241020070819.307944-1-hao.ge@linux.dev Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: Hao Ge Acked-by: Suren Baghdasaryan Suggested-by: Suren Baghdasaryan Acked-by: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 1f0a9ff23a2c..941deffc590d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,18 +135,21 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} #endif /* Caller should verify both ref and tag to be valid */ -static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { alloc_tag_add_check(ref, tag); if (!ref || !tag) - return; + return false; ref->ct = &tag->ct; + return true; } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { - __alloc_tag_ref_set(ref, tag); + if (unlikely(!__alloc_tag_ref_set(ref, tag))) + return false; + /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -154,12 +157,13 @@ static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t * counter because when we free each part the counter will be decremented. */ this_cpu_inc(tag->counters->calls); + return true; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_ref_set(ref, tag); - this_cpu_add(tag->counters->bytes, bytes); + if (likely(alloc_tag_ref_set(ref, tag))) + this_cpu_add(tag->counters->bytes, bytes); } static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) -- cgit v1.2.3-70-g09d2 From e0fc203748377835bbb4fb4c45174592214a3211 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 23 Oct 2024 13:12:36 -0400 Subject: mm: avoid VM_BUG_ON when try to map an anon large folio to zero page. An anonymous large folio can be split into non order-0 folios, try_to_map_unused_to_zeropage() should not VM_BUG_ON compound pages but just return false. This fixes the crash when splitting anonymous large folios to non order-0 folios. Link: https://lkml.kernel.org/r/20241023171236.1122535-1-ziy@nvidia.com Fixes: b1f202060afe ("mm: remap unused subpages to shared zeropage when splitting isolated thp") Signed-off-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Usama Arif Cc: Barry Song Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index df91248755e4..7e520562d421 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, pte_t newpte; void *addr; - VM_BUG_ON_PAGE(PageCompound(page), page); + if (PageCompound(page)) + return false; VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); -- cgit v1.2.3-70-g09d2 From b54e1bfecc4b2775c184d2edb319232b853a686d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 24 Oct 2024 10:02:01 +1300 Subject: mm: fix PSWPIN counter for large folios swap-in Similar to PSWPOUT, we should count the number of base pages instead of large folios. Link: https://lkml.kernel.org/r/20241023210201.2798-1-21cnbao@gmail.com Fixes: 242d12c98174 ("mm: support large folios swap-in for sync io devices") Signed-off-by: Barry Song Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Chris Li Cc: Yosry Ahmed Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/page_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 78bc88acee79..69536a2b3c13 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -570,7 +570,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); @@ -585,7 +585,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } -- cgit v1.2.3-70-g09d2 From 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 23 Oct 2024 01:55:12 +0800 Subject: mm, swap: avoid over reclaim of full clusters When running low on usable slots, cluster allocator will try to reclaim the full clusters aggressively to reclaim HAS_CACHE slots. This guarantees that as long as there are any usable slots, HAS_CACHE or not, the swap device will be usable and workload won't go OOM early. Before the cluster allocator, swap allocator fails easily if device is filled up with reclaimable HAS_CACHE slots. Which can be easily reproduced with following simple program: #include #include #include #include #define SIZE 8192UL * 1024UL * 1024UL int main(int argc, char **argv) { long tmp; char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0, SIZE); madvise(p, SIZE, MADV_PAGEOUT); for (unsigned long i = 0; i < SIZE; ++i) tmp += p[i]; getchar(); /* Pause */ return 0; } Setup an 8G non ramdisk swap, the first run of the program will swapout 8G ram successfully. But run same program again after the first run paused, the second run can't swapout all 8G memory as now half of the swap device is pinned by HAS_CACHE. There was a random scan in the old allocator that may reclaim part of the HAS_CACHE by luck, but it's unreliable. The new allocator's added reclaim of full clusters when device is low on usable slots. But when multiple CPUs are seeing the device is low on usable slots at the same time, they ran into a thundering herd problem. This is an observable problem on large machine with mass parallel workload, as full cluster reclaim is slower on large swap device and higher number of CPUs will also make things worse. Testing using a 128G ZRAM on a 48c96t system. When the swap device is very close to full (eg. 124G / 128G), running build linux kernel with make -j96 in a 1G memory cgroup will hung (not a softlockup though) spinning in full cluster reclaim for about ~5min before go OOM. To solve this, split the full reclaim into two parts: - Instead of do a synchronous aggressively reclaim when device is low, do only one aggressively reclaim when device is strictly full with a kworker. This still ensures in worst case the device won't be unusable because of HAS_CACHE slots. - To avoid allocation (especially higher order) suffer from HAS_CACHE filling up clusters and kworker not responsive enough, do one synchronous scan every time the free list is drained, and only scan one cluster. This is kind of similar to the random reclaim before, keeps the full clusters rotated and has a minimal latency. This should provide a fair reclaim strategy suitable for most workloads. Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") Signed-off-by: Kairui Song Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + mm/swapfile.c | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index ca533b478c21..f3e0ac20c2e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,6 +335,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ + struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one diff --git a/mm/swapfile.c b/mm/swapfile.c index b0915f3fab31..46bd4b1a3c07 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -731,15 +731,16 @@ done: return offset; } -static void swap_reclaim_full_clusters(struct swap_info_struct *si) +/* Return true if reclaimed a whole cluster */ +static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; - int nr_reclaim, total_reclaimed = 0; + int nr_reclaim; - if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) + if (force) to_scan = si->inuse_pages / SWAPFILE_CLUSTER; while (!list_empty(&si->full_clusters)) { @@ -749,28 +750,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; + spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { - spin_unlock(&si->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - if (nr_reclaim > 0) { - offset += nr_reclaim; - total_reclaimed += nr_reclaim; - continue; - } else if (nr_reclaim < 0) { - offset += -nr_reclaim; + if (nr_reclaim) { + offset += abs(nr_reclaim); continue; } } offset++; } - if (to_scan <= 0 || total_reclaimed) + spin_lock(&si->lock); + + if (to_scan <= 0) break; } } +static void swap_reclaim_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, reclaim_work); + + spin_lock(&si->lock); + swap_reclaim_full_clusters(si, true); + spin_unlock(&si->lock); +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU @@ -800,6 +809,10 @@ new_cluster: goto done; } + /* Try reclaim from full clusters if free clusters list is drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + if (order < PMD_ORDER) { unsigned int frags = 0; @@ -881,13 +894,6 @@ new_cluster: } done: - /* Try reclaim from full clusters if device is nearfull */ - if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { - swap_reclaim_full_clusters(si); - if (!found && !order && si->pages != si->inuse_pages) - goto new_cluster; - } - cluster->next[order] = offset; return found; } @@ -922,6 +928,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); + + if (vm_swap_full()) + schedule_work(&si->reclaim_work); } } @@ -2816,6 +2825,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) wait_for_completion(&p->comp); flush_work(&p->discard_work); + flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -3376,6 +3386,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); + INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { -- cgit v1.2.3-70-g09d2 From ece5897e5a10fcd56a317e32f2dc7219f366a5a8 Mon Sep 17 00:00:00 2001 From: Wladislav Wiebe Date: Tue, 22 Oct 2024 19:21:13 +0200 Subject: tools/mm: -Werror fixes in page-types/slabinfo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e6d2c436ff693 ("tools/mm: allow users to provide additional cflags/ldflags") passes now CFLAGS to Makefile. With this, build systems with default -Werror enabled found: slabinfo.c:1300:25: error: ignoring return value of 'chdir' declared with attribute 'warn_unused_result' [-Werror=unused-result]                          chdir("..");                          ^~~~~~~~~~~ page-types.c:397:35: error: format '%lu' expects argument of type 'long unsigned int', but argument 2 has type 'uint64_t' {aka 'long long unsigned int'} [-Werror=format=]                          printf("%lu\t", mapcnt0);                                  ~~^     ~~~~~~~ .. Fix page-types by using PRIu64 for uint64_t prints and check in slabinfo for return code on chdir(".."). Link: https://lkml.kernel.org/r/c1ceb507-94bc-461c-934d-c19b77edd825@gmail.com Fixes: e6d2c436ff69 ("tools/mm: allow users to provide additional cflags/ldflags") Signed-off-by: Wladislav Wiebe Cc: Vlastimil Babka Cc: Herton R. Krzesinski Cc: Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 9 +++++---- tools/mm/slabinfo.c | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index fa050d5a48cd..6eb17cc1a06c 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -391,9 +392,9 @@ static void show_page_range(unsigned long voffset, unsigned long offset, if (opt_file) printf("%lx\t", voff); if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup0); + printf("@%" PRIu64 "\t", cgroup0); if (opt_list_mapcnt) - printf("%lu\t", mapcnt0); + printf("%" PRIu64 "\t", mapcnt0); printf("%lx\t%lx\t%s\n", index, count, page_flag_name(flags0)); } @@ -419,9 +420,9 @@ static void show_page(unsigned long voffset, unsigned long offset, if (opt_file) printf("%lx\t", voffset); if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup); + printf("@%" PRIu64 "\t", cgroup) if (opt_list_mapcnt) - printf("%lu\t", mapcnt); + printf("%" PRIu64 "\t", mapcnt); printf("%lx\t%s\n", offset, page_flag_name(flags)); } diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index cfaeaea71042..04e9e6ba86ea 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -1297,7 +1297,9 @@ static void read_slab_dir(void) slab->cpu_partial_free = get_obj("cpu_partial_free"); slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); slab->deactivate_bypass = get_obj("deactivate_bypass"); - chdir(".."); + if (chdir("..")) + fatal("Unable to chdir from slab ../%s\n", + slab->name); if (slab->name[0] == ':') alias_targets++; slab++; -- cgit v1.2.3-70-g09d2 From 330d8df81f3673d6fb74550bbc9bb159d81b35f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Oct 2024 18:07:06 +0200 Subject: kasan: remove vmalloc_percpu test Commit 1a2473f0cbc0 ("kasan: improve vmalloc tests") added the vmalloc_percpu KASAN test with the assumption that __alloc_percpu always uses vmalloc internally, which is tagged by KASAN. However, __alloc_percpu might allocate memory from the first per-CPU chunk, which is not allocated via vmalloc(). As a result, the test might fail. Remove the test until proper KASAN annotation for the per-CPU allocated are added; tracked in https://bugzilla.kernel.org/show_bug.cgi?id=215019. Link: https://lkml.kernel.org/r/20241022160706.38943-1-andrey.konovalov@linux.dev Fixes: 1a2473f0cbc0 ("kasan: improve vmalloc tests") Signed-off-by: Andrey Konovalov Reported-by: Samuel Holland Link: https://lore.kernel.org/all/4a245fff-cc46-44d1-a5f9-fd2f1c3764ae@sifive.com/ Reported-by: Sabyrzhan Tasbolatov Link: https://lore.kernel.org/all/CACzwLxiWzNqPBp4C1VkaXZ2wDwvY3yZeetCi1TLGFipKW77drA@mail.gmail.com/ Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: Sabyrzhan Tasbolatov Cc: Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index a181e4780d9d..d8fb281e439d 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1810,32 +1810,6 @@ static void vm_map_ram_tags(struct kunit *test) free_pages((unsigned long)p_ptr, 1); } -static void vmalloc_percpu(struct kunit *test) -{ - char __percpu *ptr; - int cpu; - - /* - * This test is specifically crafted for the software tag-based mode, - * the only tag-based mode that poisons percpu mappings. - */ - KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); - - ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); - - for_each_possible_cpu(cpu) { - char *c_ptr = per_cpu_ptr(ptr, cpu); - - KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); - KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); - - /* Make sure that in-bounds accesses don't crash the kernel. */ - *c_ptr = 0; - } - - free_percpu(ptr); -} - /* * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based @@ -2023,7 +1997,6 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), KUNIT_CASE(vm_map_ram_tags), - KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), -- cgit v1.2.3-70-g09d2 From d31638ff6c5437ca2968d6c22fb16524fd485013 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 21 Oct 2024 00:22:00 +0100 Subject: Squashfs: fix variable overflow in squashfs_readpage_block Syzbot reports a slab out of bounds access in squashfs_readpage_block(). This is caused by an attempt to read page index 0x2000000000. This value (start_index) is stored in an integer loop variable which overflows producing a value of 0. This causes a loop which iterates over pages start_index -> end_index to iterate over 0 -> end_index, which ultimately causes an out of bounds page array access. Fix by changing variable to a loff_t, and rename to index to make it clearer it is a page index, and not a loop count. Link: https://lkml.kernel.org/r/20241020232200.837231-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: "Lai, Yi" Closes: https://lore.kernel.org/all/ZwzcnCAosIPqQ9Ie@ly-workstation/ Signed-off-by: Andrew Morton --- fs/squashfs/file_direct.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 22251743fadf..d19d4db74af8 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; - int i, n, pages, bytes, res = -ENOMEM; + loff_t index; + int i, pages, bytes, res = -ENOMEM; struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == folio->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); + for (i = 0, index = start_index; index <= end_index; index++) { + page[i] = (index == folio->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, index); if (page[i] == NULL) continue; -- cgit v1.2.3-70-g09d2 From b3a033e3ecd3471248d474ef263aadc0059e516a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 20 Oct 2024 13:51:28 +0900 Subject: nilfs2: fix potential deadlock with newly created symlinks Syzbot reported that page_symlink(), called by nilfs_symlink(), triggers memory reclamation involving the filesystem layer, which can result in circular lock dependencies among the reader/writer semaphore nilfs->ns_segctor_sem, s_writers percpu_rwsem (intwrite) and the fs_reclaim pseudo lock. This is because after commit 21fc61c73c39 ("don't put symlink bodies in pagecache into highmem"), the gfp flags of the page cache for symbolic links are overwritten to GFP_KERNEL via inode_nohighmem(). This is not a problem for symlinks read from the backing device, because the __GFP_FS flag is dropped after inode_nohighmem() is called. However, when a new symlink is created with nilfs_symlink(), the gfp flags remain overwritten to GFP_KERNEL. Then, memory allocation called from page_symlink() etc. triggers memory reclamation including the FS layer, which may call nilfs_evict_inode() or nilfs_dirty_inode(). And these can cause a deadlock if they are called while nilfs->ns_segctor_sem is held: Fix this issue by dropping the __GFP_FS flag from the page cache GFP flags of newly created symlinks in the same way that nilfs_new_inode() and __nilfs_read_inode() do, as a workaround until we adopt nofs allocation scope consistently or improve the locking constraints. Link: https://lkml.kernel.org/r/20241020050003.4308-1-konishi.ryusuke@gmail.com Fixes: 21fc61c73c39 ("don't put symlink bodies in pagecache into highmem") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+9ef37ac20608f4836256@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9ef37ac20608f4836256 Tested-by: syzbot+9ef37ac20608f4836256@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 4905063790c5..9b108052d9f7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -157,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~__GFP_FS)); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) -- cgit v1.2.3-70-g09d2 From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 22:22:12 -0600 Subject: mm: allow set/clear page_type again Some page flags (page->flags) were converted to page types (page->page_types). A recent example is PG_hugetlb. From the exclusive writer's perspective, e.g., a thread doing __folio_set_hugetlb(), there is a difference between the page flag and type APIs: the former allows the same non-atomic operation to be repeated whereas the latter does not. For example, calling __folio_set_hugetlb() twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type (PG_hugetlb) not to be set previously. Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in the following error-handling path. And when that happens, it triggers the aforementioned VM_BUG_ON_FOLIO(). if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); ... It is possible to make hugeTLB comply with the new requirements from the page type API. However, a straightforward fix would be to just allow the same page type to be set or cleared again inside the API, to avoid any changes to its callers. Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType") Signed-off-by: Yu Zhao Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1b3a76710487..cc839e4365c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ + if (folio_test_##fname(folio)) \ + return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ + if (folio->page.page_type == UINT_MAX) \ + return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } @@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ + if (Page##uname(page)) \ + return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ + if (page->page_type == UINT_MAX) \ + return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } -- cgit v1.2.3-70-g09d2 From 85d16bceaf5d8112c9ffcfedd2f1bb9d0a1c1578 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 25 Oct 2024 21:15:28 +0300 Subject: mailmap: update Jarkko's email addresses Remove my previous work email, and the new one. The previous was never used in the commit log, so there's no good reason to spare it. Link: https://lkml.kernel.org/r/20241025181530.6151-1-jarkko@kernel.org Signed-off-by: Jarkko Sakkinen Cc: Alex Elder Cc: David S. Miller Cc: Geliang Tang Cc: Jiri Kosina Cc: Kees Cook Cc: Matthieu Baerts (NGI0) Cc: Matt Ranostay Cc: Neeraj Upadhyay Cc: Quentin Monnet Signed-off-by: Andrew Morton --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 9a94c514e32c..5cf2fab5fe83 100644 --- a/.mailmap +++ b/.mailmap @@ -282,7 +282,7 @@ Jan Glauber Jan Kuliga Jarkko Sakkinen Jarkko Sakkinen -Jarkko Sakkinen +Jarkko Sakkinen Jason Gunthorpe Jason Gunthorpe Jason Gunthorpe -- cgit v1.2.3-70-g09d2 From 35e41024c4c2b02ef8207f61b9004f6956cf037b Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 25 Oct 2024 10:17:24 -0400 Subject: vmscan,migrate: fix page count imbalance on node stats when demoting pages When numa balancing is enabled with demotion, vmscan will call migrate_pages when shrinking LRUs. migrate_pages will decrement the the node's isolated page count, leading to an imbalanced count when invoked from (MG)LRU code. The result is dmesg output like such: $ cat /proc/sys/vm/stat_refresh [77383.088417] vmstat_refresh: nr_isolated_anon -103212 [77383.088417] vmstat_refresh: nr_isolated_file -899642 This negative value may impact compaction and reclaim throttling. The following path produces the decrement: shrink_folio_list demote_folio_list migrate_pages migrate_pages_batch migrate_folio_move migrate_folio_done mod_node_page_state(-ve) <- decrement This path happens for SUCCESSFUL migrations, not failures. Typically callers to migrate_pages are required to handle putback/accounting for failures, but this is already handled in the shrink code. When accounting for migrations, instead do not decrement the count when the migration reason is MR_DEMOTION. As of v6.11, this demotion logic is the only source of MR_DEMOTION. Link: https://lkml.kernel.org/r/20241025141724.17927-1-gourry@gourry.net Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim") Signed-off-by: Gregory Price Reviewed-by: Yang Shi Reviewed-by: Davidlohr Bueso Reviewed-by: Shakeel Butt Reviewed-by: "Huang, Ying" Reviewed-by: Oscar Salvador Cc: Dave Hansen Cc: Wei Xu Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 7e520562d421..fab84a776088 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1178,7 +1178,7 @@ static void migrate_folio_done(struct folio *src, * not accounted to NR_ISOLATED_*. They can be recognized * as __folio_test_movable */ - if (likely(!__folio_test_movable(src))) + if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); -- cgit v1.2.3-70-g09d2 From 0173471d21ec964921f97ba4eca71af74beb29f7 Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Fri, 25 Oct 2024 11:58:48 +0300 Subject: .mailmap: update e-mail address for Eugen Hristev Update e-mail address. Link: https://lkml.kernel.org/r/20241025085848.483149-1-eugen.hristev@linaro.org Signed-off-by: Eugen Hristev Signed-off-by: Andrew Morton --- .mailmap | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 5cf2fab5fe83..5378f04b2566 100644 --- a/.mailmap +++ b/.mailmap @@ -199,7 +199,8 @@ Elliot Berman Enric Balletbo i Serra Enric Balletbo i Serra Erik Kaneda -Eugen Hristev +Eugen Hristev +Eugen Hristev Evgeniy Polyakov Ezequiel Garcia Faith Ekstrand -- cgit v1.2.3-70-g09d2 From 15e8156713cc38031642fafc8baf7d53f19f2e83 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 25 Oct 2024 06:09:42 +0000 Subject: mm: shrinker: avoid memleak in alloc_shrinker_info A memleak was found as below: unreferenced object 0xffff8881010d2a80 (size 32): comm "mkdir", pid 1559, jiffies 4294932666 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 40 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 @............... backtrace (crc 2e7ef6fa): [] __kmalloc_node_noprof+0x394/0x470 [] alloc_shrinker_info+0x7b/0x1a0 [] mem_cgroup_css_online+0x11a/0x3b0 [] online_css+0x29/0xa0 [] cgroup_apply_control_enable+0x20d/0x360 [] cgroup_mkdir+0x168/0x5f0 [] kernfs_iop_mkdir+0x5e/0x90 [] vfs_mkdir+0x144/0x220 [] do_mkdirat+0x87/0x130 [] __x64_sys_mkdir+0x49/0x70 [] do_syscall_64+0x68/0x140 [] entry_SYSCALL_64_after_hwframe+0x76/0x7e alloc_shrinker_info(), when shrinker_unit_alloc() returns an errer, the info won't be freed. Just fix it. Link: https://lkml.kernel.org/r/20241025060942.1049263-1-chenridong@huaweicloud.com Fixes: 307bececcd12 ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}") Signed-off-by: Chen Ridong Acked-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Reviewed-by: Dave Chinner Cc: Anshuman Khandual Cc: Muchun Song Cc: Wang Weiyang Cc: Signed-off-by: Andrew Morton --- mm/shrinker.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index dc5d2a6fcfc4..4a93fd433689 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct shrinker_info *info; int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, + GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; - if (shrinker_unit_alloc(info, NULL, nid)) + if (shrinker_unit_alloc(info, NULL, nid)) { + kvfree(info); goto err; + } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); -- cgit v1.2.3-70-g09d2 From d4148aeab412432bf928f311eca8a2ba52bb05df Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 24 Oct 2024 17:12:29 +0200 Subject: mm, mmap: limit THP alignment of anonymous mappings to PMD-aligned sizes Since commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") a mmap() of anonymous memory without a specific address hint and of at least PMD_SIZE will be aligned to PMD so that it can benefit from a THP backing page. However this change has been shown to regress some workloads significantly. [1] reports regressions in various spec benchmarks, with up to 600% slowdown of the cactusBSSN benchmark on some platforms. The benchmark seems to create many mappings of 4632kB, which would have merged to a large THP-backed area before commit efa7df3e3bb5 and now they are fragmented to multiple areas each aligned to PMD boundary with gaps between. The regression then seems to be caused mainly due to the benchmark's memory access pattern suffering from TLB or cache aliasing due to the aligned boundaries of the individual areas. Another known regression bisected to commit efa7df3e3bb5 is darktable [2] [3] and early testing suggests this patch fixes the regression there as well. To fix the regression but still try to benefit from THP-friendly anonymous mapping alignment, add a condition that the size of the mapping must be a multiple of PMD size instead of at least PMD size. In case of many odd-sized mapping like the cactusBSSN creates, those will stop being aligned and with gaps between, and instead naturally merge again. Link: https://lkml.kernel.org/r/20241024151228.101841-2-vbabka@suse.cz Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") Signed-off-by: Vlastimil Babka Reported-by: Michael Matz Debugged-by: Gabriel Krisman Bertazi Closes: https://bugzilla.suse.com/show_bug.cgi?id=1229012 [1] Reported-by: Matthias Bodenbinder Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219366 [2] Closes: https://lore.kernel.org/all/2050f0d4-57b0-481d-bab8-05e8d48fed0c@leemhuis.info/ [3] Reviewed-by: Lorenzo Stoakes Reviewed-by: Yang Shi Cc: Rik van Riel Cc: Jann Horn Cc: Liam R. Howlett Cc: Petr Tesarik Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1e0e34cb993f..9841b41e3c76 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -900,7 +900,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); -- cgit v1.2.3-70-g09d2 From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:38 +0000 Subject: mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL stats Patch series "mm: multi-gen LRU: Have secondary MMUs participate in MM_WALK". Today, the MM_WALK capability causes MGLRU to clear the young bit from PMDs and PTEs during the page table walk before eviction, but MGLRU does not call the clear_young() MMU notifier in this case. By not calling this notifier, the MM walk takes less time/CPU, but it causes pages that are accessed mostly through KVM / secondary MMUs to appear younger than they should be. We do call the clear_young() notifier today, but only when attempting to evict the page, so we end up clearing young/accessed information less frequently for secondary MMUs than for mm PTEs, and therefore they appear younger and are less likely to be evicted. Therefore, memory that is *not* being accessed mostly by KVM will be evicted *more* frequently, worsening performance. ChromeOS observed a tab-open latency regression when enabling MGLRU with a setup that involved running a VM: Tab-open latency histogram (ms) Version p50 mean p95 p99 max base 1315 1198 2347 3454 10319 mglru 2559 1311 7399 12060 43758 fix 1119 926 2470 4211 6947 This series replaces the final non-selftest patchs from this series[1], which introduced a similar change (and a new MMU notifier) with KVM optimizations. I'll send a separate series (to Sean and Paolo) for the KVM optimizations. This series also makes proactive reclaim with MGLRU possible for KVM memory. I have verified that this functions correctly with the selftest from [1], but given that that test is a KVM selftest, I'll send it with the rest of the KVM optimizations later. Andrew, let me know if you'd like to take the test now anyway. [1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google.com/ This patch (of 2): The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful and become more complicated to properly compute when adding test/clear_young() notifiers in MGLRU's mm walk. Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: David Stevens Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 14 +++++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 17506e4a2835..9342e5692dab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,9 +458,7 @@ struct lru_gen_folio { enum { MM_LEAF_TOTAL, /* total leaf entries */ - MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ - MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS diff --git a/mm/vmscan.c b/mm/vmscan.c index eb4e8440c507..4f1d33e4b360 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3399,7 +3399,6 @@ restart: continue; if (!pte_young(ptent)) { - walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -3552,7 +3551,6 @@ restart: walk->mm_stats[MM_LEAF_TOTAL]++; if (!pmd_young(val)) { - walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -3564,8 +3562,6 @@ restart: continue; } - walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (!walk->force_scan && should_clear_pmd_young()) { if (!pmd_young(val)) continue; @@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { - const char *s = " "; + const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { - s = "RT "; + s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { @@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { - const char *s = " "; + const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { - s = "LOYNFA"; + s = "TYFA"; n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { - s = "loynfa"; + s = "tyfa"; n = READ_ONCE(mm_state->stats[hist][i]); } -- cgit v1.2.3-70-g09d2 From 1d4832becdc2cdb2cffe2a6050c9d9fd8ff1c58c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:39 +0000 Subject: mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify() When the MM_WALK capability is enabled, memory that is mostly accessed by a VM appears younger than it really is, therefore this memory will be less likely to be evicted. Therefore, the presence of a running VM can significantly increase swap-outs for non-VM memory, regressing the performance for the rest of the system. Fix this regression by always calling {ptep,pmdp}_clear_young_notify() whenever we clear the young bits on PMDs/PTEs. [jthoughton@google.com: fix link-time error] Link: https://lkml.kernel.org/r/20241019012940.3656292-3-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Reported-by: David Stevens Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +-- mm/rmap.c | 9 ++---- mm/vmscan.c | 88 ++++++++++++++++++++++++++++---------------------- 3 files changed, 55 insertions(+), 47 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9342e5692dab..5b1c984daf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -555,7 +555,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -574,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { + return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) diff --git a/mm/rmap.c b/mm/rmap.c index a8797d1b3d49..73d5998677d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -885,13 +885,10 @@ static bool folio_referenced_one(struct folio *folio, return false; } - if (pvmw.pte) { - if (lru_gen_enabled() && - pte_young(ptep_get(pvmw.pte))) { - lru_gen_look_around(&pvmw); + if (lru_gen_enabled() && pvmw.pte) { + if (lru_gen_look_around(&pvmw)) referenced++; - } - + } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f1d33e4b360..ddaaff67642e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk return false; } -static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); @@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) return -1; + if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } -static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); @@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pmd_devmap(pmd))) return -1; + if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } @@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, { struct folio *folio; - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; @@ -3394,20 +3405,16 @@ restart: total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(ptent, args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) { - continue; - } - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + continue; young++; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3473,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) + if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { - if (!walk->force_scan && should_clear_pmd_young()) + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } + pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); + if (pfn == -1) + goto next; + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) goto next; - if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3545,24 +3556,18 @@ restart: } if (pmd_trans_huge(val)) { - unsigned long pfn = pmd_pfn(val); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; - if (!pmd_young(val)) { - continue; - } - - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + if (pfn != -1) + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } - if (!walk->force_scan && should_clear_pmd_young()) { + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) { if (!pmd_young(val)) continue; @@ -4036,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; - int young = 0; + int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; @@ -4058,12 +4063,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + if (!ptep_clear_young_notify(vma, addr, pte)) + return false; + if (spin_is_contended(pvmw->ptl)) - return; + return true; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) - return; + return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; @@ -4071,6 +4079,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; + if (end - start == PAGE_SIZE) + return true; + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; @@ -4084,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* folio_update_gen() requires stable folio_memcg() */ if (!mem_cgroup_trylock_pages(memcg)) - return; + return true; arch_enter_lazy_mmu_mode(); @@ -4094,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long pfn; pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(ptent, vma, addr); + pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) - continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(vma, addr, pte + i)) + continue; young++; @@ -4136,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); + + return true; } /****************************************************************************** -- cgit v1.2.3-70-g09d2