diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-03 10:25:05 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-03 10:25:05 -1000 |
commit | a8cc7432728d019a10cb412401ebc15ed7504289 (patch) | |
tree | 91500fbca59e047fe67e58468241416a3594556e | |
parent | d5aaa0bc6de9c2649fa15def775a6710c052c966 (diff) | |
parent | 1d4832becdc2cdb2cffe2a6050c9d9fd8ff1c58c (diff) |
Merge tag 'mm-hotfixes-stable-2024-11-03-10-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
"17 hotfixes. 9 are cc:stable. 13 are MM and 4 are non-MM.
The usual collection of singletons - please see the changelogs"
* tag 'mm-hotfixes-stable-2024-11-03-10-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify()
mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL stats
mm, mmap: limit THP alignment of anonymous mappings to PMD-aligned sizes
mm: shrinker: avoid memleak in alloc_shrinker_info
.mailmap: update e-mail address for Eugen Hristev
vmscan,migrate: fix page count imbalance on node stats when demoting pages
mailmap: update Jarkko's email addresses
mm: allow set/clear page_type again
nilfs2: fix potential deadlock with newly created symlinks
Squashfs: fix variable overflow in squashfs_readpage_block
kasan: remove vmalloc_percpu test
tools/mm: -Werror fixes in page-types/slabinfo
mm, swap: avoid over reclaim of full clusters
mm: fix PSWPIN counter for large folios swap-in
mm: avoid VM_BUG_ON when try to map an anon large folio to zero page.
mm/codetag: fix null pointer check logic for ref and tag
mm/gup: stop leaking pinned pages in low memory conditions
-rw-r--r-- | .mailmap | 5 | ||||
-rw-r--r-- | fs/nilfs2/namei.c | 3 | ||||
-rw-r--r-- | fs/squashfs/file_direct.c | 9 | ||||
-rw-r--r-- | include/linux/alloc_tag.h | 16 | ||||
-rw-r--r-- | include/linux/mmzone.h | 7 | ||||
-rw-r--r-- | include/linux/page-flags.h | 8 | ||||
-rw-r--r-- | include/linux/swap.h | 1 | ||||
-rw-r--r-- | mm/gup.c | 33 | ||||
-rw-r--r-- | mm/kasan/kasan_test_c.c | 27 | ||||
-rw-r--r-- | mm/migrate.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 3 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 9 | ||||
-rw-r--r-- | mm/shrinker.c | 8 | ||||
-rw-r--r-- | mm/swapfile.c | 49 | ||||
-rw-r--r-- | mm/vmscan.c | 102 | ||||
-rw-r--r-- | tools/mm/page-types.c | 9 | ||||
-rw-r--r-- | tools/mm/slabinfo.c | 4 |
18 files changed, 159 insertions, 143 deletions
@@ -199,7 +199,8 @@ Elliot Berman <quic_eberman@quicinc.com> <eberman@codeaurora.org> Enric Balletbo i Serra <eballetbo@kernel.org> <enric.balletbo@collabora.com> Enric Balletbo i Serra <eballetbo@kernel.org> <eballetbo@iseebcn.com> Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com> -Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com> +Eugen Hristev <eugen.hristev@linaro.org> <eugen.hristev@microchip.com> +Eugen Hristev <eugen.hristev@linaro.org> <eugen.hristev@collabora.com> Evgeniy Polyakov <johnpol@2ka.mipt.ru> Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com> Faith Ekstrand <faith.ekstrand@collabora.com> <jason@jlekstrand.net> @@ -282,7 +283,7 @@ Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com> Jan Kuliga <jtkuliga.kdev@gmail.com> <jankul@alatek.krakow.pl> Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@linux.intel.com> Jarkko Sakkinen <jarkko@kernel.org> <jarkko@profian.com> -Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@tuni.fi> +Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@parity.io> Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com> Jason Gunthorpe <jgg@ziepe.ca> <jgg@nvidia.com> Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com> diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 4905063790c5..9b108052d9f7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -157,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~__GFP_FS)); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 22251743fadf..d19d4db74af8 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; - int i, n, pages, bytes, res = -ENOMEM; + loff_t index; + int i, pages, bytes, res = -ENOMEM; struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == folio->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); + for (i = 0, index = start_index; index <= end_index; index++) { + page[i] = (index == folio->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, index); if (page[i] == NULL) continue; diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 1f0a9ff23a2c..941deffc590d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,18 +135,21 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} #endif /* Caller should verify both ref and tag to be valid */ -static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { alloc_tag_add_check(ref, tag); if (!ref || !tag) - return; + return false; ref->ct = &tag->ct; + return true; } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { - __alloc_tag_ref_set(ref, tag); + if (unlikely(!__alloc_tag_ref_set(ref, tag))) + return false; + /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -154,12 +157,13 @@ static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t * counter because when we free each part the counter will be decremented. */ this_cpu_inc(tag->counters->calls); + return true; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_ref_set(ref, tag); - this_cpu_add(tag->counters->bytes, bytes); + if (likely(alloc_tag_ref_set(ref, tag))) + this_cpu_add(tag->counters->bytes, bytes); } static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 17506e4a2835..5b1c984daf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,9 +458,7 @@ struct lru_gen_folio { enum { MM_LEAF_TOTAL, /* total leaf entries */ - MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ - MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS @@ -557,7 +555,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -576,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { + return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1b3a76710487..cc839e4365c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ + if (folio_test_##fname(folio)) \ + return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ + if (folio->page.page_type == UINT_MAX) \ + return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } @@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ + if (Page##uname(page)) \ + return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ + if (page->page_type == UINT_MAX) \ + return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } diff --git a/include/linux/swap.h b/include/linux/swap.h index ca533b478c21..f3e0ac20c2e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,6 +335,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ + struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one @@ -2394,20 +2394,25 @@ err: } /* - * Check whether all folios are *allowed* to be pinned indefinitely (longterm). + * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * - * If any folios in the range are not allowed to be pinned, then this routine - * will migrate those folios away, unpin all the folios in the range and return - * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then - * call this routine again. + * Return values: * - * If an error other than -EAGAIN occurs, this indicates a migration failure. - * The caller should give up, and propagate the error back up the call stack. - * - * If everything is OK and all folios in the range are allowed to be pinned, + * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. + * + * -EAGAIN: if any folios in the range are not allowed to be pinned, then this + * routine will migrate those folios away, unpin all the folios in the range. If + * migration of the entire set of folios succeeds, then -EAGAIN is returned. The + * caller should re-pin the entire range with FOLL_PIN and then call this + * routine again. + * + * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this + * indicates a migration failure. The caller should give up, and propagate the + * error back up the call stack. The caller does not need to unpin any folios in + * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) @@ -2425,10 +2430,8 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios, } /* - * This routine just converts all the pages in the @pages array to folios and - * calls check_and_migrate_movable_folios() to do the heavy lifting. - * - * Please see the check_and_migrate_movable_folios() documentation for details. + * Return values and behavior are the same as those for + * check_and_migrate_movable_folios(). */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) @@ -2437,8 +2440,10 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, long i, ret; folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL); - if (!folios) + if (!folios) { + unpin_user_pages(pages, nr_pages); return -ENOMEM; + } for (i = 0; i < nr_pages; i++) folios[i] = page_folio(pages[i]); diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index a181e4780d9d..d8fb281e439d 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1810,32 +1810,6 @@ static void vm_map_ram_tags(struct kunit *test) free_pages((unsigned long)p_ptr, 1); } -static void vmalloc_percpu(struct kunit *test) -{ - char __percpu *ptr; - int cpu; - - /* - * This test is specifically crafted for the software tag-based mode, - * the only tag-based mode that poisons percpu mappings. - */ - KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); - - ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); - - for_each_possible_cpu(cpu) { - char *c_ptr = per_cpu_ptr(ptr, cpu); - - KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); - KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); - - /* Make sure that in-bounds accesses don't crash the kernel. */ - *c_ptr = 0; - } - - free_percpu(ptr); -} - /* * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based @@ -2023,7 +1997,6 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), KUNIT_CASE(vm_map_ram_tags), - KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), diff --git a/mm/migrate.c b/mm/migrate.c index df91248755e4..fab84a776088 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, pte_t newpte; void *addr; - VM_BUG_ON_PAGE(PageCompound(page), page); + if (PageCompound(page)) + return false; VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); @@ -1177,7 +1178,7 @@ static void migrate_folio_done(struct folio *src, * not accounted to NR_ISOLATED_*. They can be recognized * as __folio_test_movable */ - if (likely(!__folio_test_movable(src))) + if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); diff --git a/mm/mmap.c b/mm/mmap.c index 1e0e34cb993f..9841b41e3c76 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -900,7 +900,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); diff --git a/mm/page_io.c b/mm/page_io.c index 78bc88acee79..69536a2b3c13 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -570,7 +570,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); @@ -585,7 +585,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } diff --git a/mm/rmap.c b/mm/rmap.c index a8797d1b3d49..73d5998677d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -885,13 +885,10 @@ static bool folio_referenced_one(struct folio *folio, return false; } - if (pvmw.pte) { - if (lru_gen_enabled() && - pte_young(ptep_get(pvmw.pte))) { - lru_gen_look_around(&pvmw); + if (lru_gen_enabled() && pvmw.pte) { + if (lru_gen_look_around(&pvmw)) referenced++; - } - + } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; diff --git a/mm/shrinker.c b/mm/shrinker.c index dc5d2a6fcfc4..4a93fd433689 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct shrinker_info *info; int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, + GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; - if (shrinker_unit_alloc(info, NULL, nid)) + if (shrinker_unit_alloc(info, NULL, nid)) { + kvfree(info); goto err; + } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); diff --git a/mm/swapfile.c b/mm/swapfile.c index b0915f3fab31..46bd4b1a3c07 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -731,15 +731,16 @@ done: return offset; } -static void swap_reclaim_full_clusters(struct swap_info_struct *si) +/* Return true if reclaimed a whole cluster */ +static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; - int nr_reclaim, total_reclaimed = 0; + int nr_reclaim; - if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) + if (force) to_scan = si->inuse_pages / SWAPFILE_CLUSTER; while (!list_empty(&si->full_clusters)) { @@ -749,28 +750,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; + spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { - spin_unlock(&si->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - if (nr_reclaim > 0) { - offset += nr_reclaim; - total_reclaimed += nr_reclaim; - continue; - } else if (nr_reclaim < 0) { - offset += -nr_reclaim; + if (nr_reclaim) { + offset += abs(nr_reclaim); continue; } } offset++; } - if (to_scan <= 0 || total_reclaimed) + spin_lock(&si->lock); + + if (to_scan <= 0) break; } } +static void swap_reclaim_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, reclaim_work); + + spin_lock(&si->lock); + swap_reclaim_full_clusters(si, true); + spin_unlock(&si->lock); +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU @@ -800,6 +809,10 @@ new_cluster: goto done; } + /* Try reclaim from full clusters if free clusters list is drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + if (order < PMD_ORDER) { unsigned int frags = 0; @@ -881,13 +894,6 @@ new_cluster: } done: - /* Try reclaim from full clusters if device is nearfull */ - if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { - swap_reclaim_full_clusters(si); - if (!found && !order && si->pages != si->inuse_pages) - goto new_cluster; - } - cluster->next[order] = offset; return found; } @@ -922,6 +928,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); + + if (vm_swap_full()) + schedule_work(&si->reclaim_work); } } @@ -2816,6 +2825,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) wait_for_completion(&p->comp); flush_work(&p->discard_work); + flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -3376,6 +3386,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); + INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index eb4e8440c507..ddaaff67642e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk return false; } -static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); @@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) return -1; + if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } -static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); @@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pmd_devmap(pmd))) return -1; + if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } @@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, { struct folio *folio; - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; @@ -3394,21 +3405,16 @@ restart: total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(ptent, args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) { - walk->mm_stats[MM_LEAF_OLD]++; - continue; - } - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + continue; young++; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3474,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) + if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { - if (!walk->force_scan && should_clear_pmd_young()) + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } + pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); + if (pfn == -1) + goto next; + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) goto next; - if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3546,27 +3556,18 @@ restart: } if (pmd_trans_huge(val)) { - unsigned long pfn = pmd_pfn(val); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; - if (!pmd_young(val)) { - walk->mm_stats[MM_LEAF_OLD]++; - continue; - } - - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + if (pfn != -1) + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } - walk->mm_stats[MM_NONLEAF_TOTAL]++; - - if (!walk->force_scan && should_clear_pmd_young()) { + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) { if (!pmd_young(val)) continue; @@ -4040,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; - int young = 0; + int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; @@ -4062,12 +4063,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + if (!ptep_clear_young_notify(vma, addr, pte)) + return false; + if (spin_is_contended(pvmw->ptl)) - return; + return true; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) - return; + return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; @@ -4075,6 +4079,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; + if (end - start == PAGE_SIZE) + return true; + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; @@ -4088,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* folio_update_gen() requires stable folio_memcg() */ if (!mem_cgroup_trylock_pages(memcg)) - return; + return true; arch_enter_lazy_mmu_mode(); @@ -4098,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long pfn; pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(ptent, vma, addr); + pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) - continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(vma, addr, pte + i)) + continue; young++; @@ -4140,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); + + return true; } /****************************************************************************** @@ -5254,11 +5260,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { - const char *s = " "; + const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { - s = "RT "; + s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { @@ -5280,14 +5286,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { - const char *s = " "; + const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { - s = "LOYNFA"; + s = "TYFA"; n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { - s = "loynfa"; + s = "tyfa"; n = READ_ONCE(mm_state->stats[hist][i]); } diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index fa050d5a48cd..6eb17cc1a06c 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -22,6 +22,7 @@ #include <time.h> #include <setjmp.h> #include <signal.h> +#include <inttypes.h> #include <sys/types.h> #include <sys/errno.h> #include <sys/fcntl.h> @@ -391,9 +392,9 @@ static void show_page_range(unsigned long voffset, unsigned long offset, if (opt_file) printf("%lx\t", voff); if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup0); + printf("@%" PRIu64 "\t", cgroup0); if (opt_list_mapcnt) - printf("%lu\t", mapcnt0); + printf("%" PRIu64 "\t", mapcnt0); printf("%lx\t%lx\t%s\n", index, count, page_flag_name(flags0)); } @@ -419,9 +420,9 @@ static void show_page(unsigned long voffset, unsigned long offset, if (opt_file) printf("%lx\t", voffset); if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup); + printf("@%" PRIu64 "\t", cgroup) if (opt_list_mapcnt) - printf("%lu\t", mapcnt); + printf("%" PRIu64 "\t", mapcnt); printf("%lx\t%s\n", offset, page_flag_name(flags)); } diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index cfaeaea71042..04e9e6ba86ea 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -1297,7 +1297,9 @@ static void read_slab_dir(void) slab->cpu_partial_free = get_obj("cpu_partial_free"); slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); slab->deactivate_bypass = get_obj("deactivate_bypass"); - chdir(".."); + if (chdir("..")) + fatal("Unable to chdir from slab ../%s\n", + slab->name); if (slab->name[0] == ':') alias_targets++; slab++; |