summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-03 10:25:05 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-03 10:25:05 -1000
commita8cc7432728d019a10cb412401ebc15ed7504289 (patch)
tree91500fbca59e047fe67e58468241416a3594556e
parentd5aaa0bc6de9c2649fa15def775a6710c052c966 (diff)
parent1d4832becdc2cdb2cffe2a6050c9d9fd8ff1c58c (diff)
Merge tag 'mm-hotfixes-stable-2024-11-03-10-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton: "17 hotfixes. 9 are cc:stable. 13 are MM and 4 are non-MM. The usual collection of singletons - please see the changelogs" * tag 'mm-hotfixes-stable-2024-11-03-10-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify() mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL stats mm, mmap: limit THP alignment of anonymous mappings to PMD-aligned sizes mm: shrinker: avoid memleak in alloc_shrinker_info .mailmap: update e-mail address for Eugen Hristev vmscan,migrate: fix page count imbalance on node stats when demoting pages mailmap: update Jarkko's email addresses mm: allow set/clear page_type again nilfs2: fix potential deadlock with newly created symlinks Squashfs: fix variable overflow in squashfs_readpage_block kasan: remove vmalloc_percpu test tools/mm: -Werror fixes in page-types/slabinfo mm, swap: avoid over reclaim of full clusters mm: fix PSWPIN counter for large folios swap-in mm: avoid VM_BUG_ON when try to map an anon large folio to zero page. mm/codetag: fix null pointer check logic for ref and tag mm/gup: stop leaking pinned pages in low memory conditions
-rw-r--r--.mailmap5
-rw-r--r--fs/nilfs2/namei.c3
-rw-r--r--fs/squashfs/file_direct.c9
-rw-r--r--include/linux/alloc_tag.h16
-rw-r--r--include/linux/mmzone.h7
-rw-r--r--include/linux/page-flags.h8
-rw-r--r--include/linux/swap.h1
-rw-r--r--mm/gup.c33
-rw-r--r--mm/kasan/kasan_test_c.c27
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/rmap.c9
-rw-r--r--mm/shrinker.c8
-rw-r--r--mm/swapfile.c49
-rw-r--r--mm/vmscan.c102
-rw-r--r--tools/mm/page-types.c9
-rw-r--r--tools/mm/slabinfo.c4
18 files changed, 159 insertions, 143 deletions
diff --git a/.mailmap b/.mailmap
index 9a94c514e32c..5378f04b2566 100644
--- a/.mailmap
+++ b/.mailmap
@@ -199,7 +199,8 @@ Elliot Berman <quic_eberman@quicinc.com> <eberman@codeaurora.org>
Enric Balletbo i Serra <eballetbo@kernel.org> <enric.balletbo@collabora.com>
Enric Balletbo i Serra <eballetbo@kernel.org> <eballetbo@iseebcn.com>
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
-Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>
+Eugen Hristev <eugen.hristev@linaro.org> <eugen.hristev@microchip.com>
+Eugen Hristev <eugen.hristev@linaro.org> <eugen.hristev@collabora.com>
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>
Faith Ekstrand <faith.ekstrand@collabora.com> <jason@jlekstrand.net>
@@ -282,7 +283,7 @@ Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com>
Jan Kuliga <jtkuliga.kdev@gmail.com> <jankul@alatek.krakow.pl>
Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@linux.intel.com>
Jarkko Sakkinen <jarkko@kernel.org> <jarkko@profian.com>
-Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@tuni.fi>
+Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@parity.io>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@nvidia.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 4905063790c5..9b108052d9f7 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -157,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_constraint(inode->i_mapping,
+ ~__GFP_FS));
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 22251743fadf..d19d4db74af8 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
loff_t start_index = folio->index & ~mask;
loff_t end_index = start_index | mask;
- int i, n, pages, bytes, res = -ENOMEM;
+ loff_t index;
+ int i, pages, bytes, res = -ENOMEM;
struct page **page, *last_page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
return res;
/* Try to grab all the pages covered by the Squashfs block */
- for (i = 0, n = start_index; n <= end_index; n++) {
- page[i] = (n == folio->index) ? target_page :
- grab_cache_page_nowait(target_page->mapping, n);
+ for (i = 0, index = start_index; index <= end_index; index++) {
+ page[i] = (index == folio->index) ? target_page :
+ grab_cache_page_nowait(target_page->mapping, index);
if (page[i] == NULL)
continue;
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 1f0a9ff23a2c..941deffc590d 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -135,18 +135,21 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
#endif
/* Caller should verify both ref and tag to be valid */
-static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
{
alloc_tag_add_check(ref, tag);
if (!ref || !tag)
- return;
+ return false;
ref->ct = &tag->ct;
+ return true;
}
-static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
{
- __alloc_tag_ref_set(ref, tag);
+ if (unlikely(!__alloc_tag_ref_set(ref, tag)))
+ return false;
+
/*
* We need in increment the call counter every time we have a new
* allocation or when we split a large allocation into smaller ones.
@@ -154,12 +157,13 @@ static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t
* counter because when we free each part the counter will be decremented.
*/
this_cpu_inc(tag->counters->calls);
+ return true;
}
static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
{
- alloc_tag_ref_set(ref, tag);
- this_cpu_add(tag->counters->bytes, bytes);
+ if (likely(alloc_tag_ref_set(ref, tag)))
+ this_cpu_add(tag->counters->bytes, bytes);
}
static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a2835..5b1c984daf45 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
@@ -557,7 +555,7 @@ struct lru_gen_memcg {
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -576,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
+ return false;
}
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a76710487..cc839e4365c1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca533b478c21..f3e0ac20c2e8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -335,6 +335,7 @@ struct swap_info_struct {
* list.
*/
struct work_struct discard_work; /* discard worker */
+ struct work_struct reclaim_work; /* reclaim worker */
struct list_head discard_clusters; /* discard clusters list */
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..4637dab7b54f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2394,20 +2394,25 @@ err:
}
/*
- * Check whether all folios are *allowed* to be pinned indefinitely (longterm).
+ * Check whether all folios are *allowed* to be pinned indefinitely (long term).
* Rather confusingly, all folios in the range are required to be pinned via
* FOLL_PIN, before calling this routine.
*
- * If any folios in the range are not allowed to be pinned, then this routine
- * will migrate those folios away, unpin all the folios in the range and return
- * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
- * call this routine again.
+ * Return values:
*
- * If an error other than -EAGAIN occurs, this indicates a migration failure.
- * The caller should give up, and propagate the error back up the call stack.
- *
- * If everything is OK and all folios in the range are allowed to be pinned,
+ * 0: if everything is OK and all folios in the range are allowed to be pinned,
* then this routine leaves all folios pinned and returns zero for success.
+ *
+ * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
+ * routine will migrate those folios away, unpin all the folios in the range. If
+ * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
+ * caller should re-pin the entire range with FOLL_PIN and then call this
+ * routine again.
+ *
+ * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
+ * indicates a migration failure. The caller should give up, and propagate the
+ * error back up the call stack. The caller does not need to unpin any folios in
+ * that case, because this routine will do the unpinning.
*/
static long check_and_migrate_movable_folios(unsigned long nr_folios,
struct folio **folios)
@@ -2425,10 +2430,8 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios,
}
/*
- * This routine just converts all the pages in the @pages array to folios and
- * calls check_and_migrate_movable_folios() to do the heavy lifting.
- *
- * Please see the check_and_migrate_movable_folios() documentation for details.
+ * Return values and behavior are the same as those for
+ * check_and_migrate_movable_folios().
*/
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages)
@@ -2437,8 +2440,10 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
long i, ret;
folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
- if (!folios)
+ if (!folios) {
+ unpin_user_pages(pages, nr_pages);
return -ENOMEM;
+ }
for (i = 0; i < nr_pages; i++)
folios[i] = page_folio(pages[i]);
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index a181e4780d9d..d8fb281e439d 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1810,32 +1810,6 @@ static void vm_map_ram_tags(struct kunit *test)
free_pages((unsigned long)p_ptr, 1);
}
-static void vmalloc_percpu(struct kunit *test)
-{
- char __percpu *ptr;
- int cpu;
-
- /*
- * This test is specifically crafted for the software tag-based mode,
- * the only tag-based mode that poisons percpu mappings.
- */
- KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
-
- ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
-
- for_each_possible_cpu(cpu) {
- char *c_ptr = per_cpu_ptr(ptr, cpu);
-
- KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
- KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
-
- /* Make sure that in-bounds accesses don't crash the kernel. */
- *c_ptr = 0;
- }
-
- free_percpu(ptr);
-}
-
/*
* Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
* KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
@@ -2023,7 +1997,6 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
KUNIT_CASE(vm_map_ram_tags),
- KUNIT_CASE(vmalloc_percpu),
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
diff --git a/mm/migrate.c b/mm/migrate.c
index df91248755e4..fab84a776088 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
pte_t newpte;
void *addr;
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ if (PageCompound(page))
+ return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
@@ -1177,7 +1178,7 @@ static void migrate_folio_done(struct folio *src,
* not accounted to NR_ISOLATED_*. They can be recognized
* as __folio_test_movable
*/
- if (likely(!__folio_test_movable(src)))
+ if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
folio_is_file_lru(src), -folio_nr_pages(src));
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e0e34cb993f..9841b41e3c76 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -900,7 +900,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (get_area) {
addr = get_area(file, addr, len, pgoff, flags);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
+ && IS_ALIGNED(len, PMD_SIZE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
addr = thp_get_unmapped_area_vmflags(file, addr, len,
pgoff, flags, vm_flags);
diff --git a/mm/page_io.c b/mm/page_io.c
index 78bc88acee79..69536a2b3c13 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -570,7 +570,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
* attempt to access it in the page fault retry time check.
*/
get_task_struct(current);
- count_vm_event(PSWPIN);
+ count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio_wait(&bio);
__end_swap_bio_read(&bio);
put_task_struct(current);
@@ -585,7 +585,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_read;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
- count_vm_event(PSWPIN);
+ count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio(bio);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index a8797d1b3d49..73d5998677d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -885,13 +885,10 @@ static bool folio_referenced_one(struct folio *folio,
return false;
}
- if (pvmw.pte) {
- if (lru_gen_enabled() &&
- pte_young(ptep_get(pvmw.pte))) {
- lru_gen_look_around(&pvmw);
+ if (lru_gen_enabled() && pvmw.pte) {
+ if (lru_gen_look_around(&pvmw))
referenced++;
- }
-
+ } else if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte))
referenced++;
diff --git a/mm/shrinker.c b/mm/shrinker.c
index dc5d2a6fcfc4..4a93fd433689 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg)
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
- struct shrinker_info *info;
int nid, ret = 0;
int array_size = 0;
mutex_lock(&shrinker_mutex);
array_size = shrinker_unit_size(shrinker_nr_max);
for_each_node(nid) {
- info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
+ struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
+ GFP_KERNEL, nid);
if (!info)
goto err;
info->map_nr_max = shrinker_nr_max;
- if (shrinker_unit_alloc(info, NULL, nid))
+ if (shrinker_unit_alloc(info, NULL, nid)) {
+ kvfree(info);
goto err;
+ }
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
mutex_unlock(&shrinker_mutex);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0915f3fab31..46bd4b1a3c07 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -731,15 +731,16 @@ done:
return offset;
}
-static void swap_reclaim_full_clusters(struct swap_info_struct *si)
+/* Return true if reclaimed a whole cluster */
+static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
unsigned long offset, end;
struct swap_cluster_info *ci;
unsigned char *map = si->swap_map;
- int nr_reclaim, total_reclaimed = 0;
+ int nr_reclaim;
- if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER)
+ if (force)
to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
while (!list_empty(&si->full_clusters)) {
@@ -749,28 +750,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si)
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
+ spin_unlock(&si->lock);
while (offset < end) {
if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
- spin_unlock(&si->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY | TTRS_DIRECT);
- spin_lock(&si->lock);
- if (nr_reclaim > 0) {
- offset += nr_reclaim;
- total_reclaimed += nr_reclaim;
- continue;
- } else if (nr_reclaim < 0) {
- offset += -nr_reclaim;
+ if (nr_reclaim) {
+ offset += abs(nr_reclaim);
continue;
}
}
offset++;
}
- if (to_scan <= 0 || total_reclaimed)
+ spin_lock(&si->lock);
+
+ if (to_scan <= 0)
break;
}
}
+static void swap_reclaim_work(struct work_struct *work)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(work, struct swap_info_struct, reclaim_work);
+
+ spin_lock(&si->lock);
+ swap_reclaim_full_clusters(si, true);
+ spin_unlock(&si->lock);
+}
+
/*
* Try to get swap entries with specified order from current cpu's swap entry
* pool (a cluster). This might involve allocating a new cluster for current CPU
@@ -800,6 +809,10 @@ new_cluster:
goto done;
}
+ /* Try reclaim from full clusters if free clusters list is drained */
+ if (vm_swap_full())
+ swap_reclaim_full_clusters(si, false);
+
if (order < PMD_ORDER) {
unsigned int frags = 0;
@@ -881,13 +894,6 @@ new_cluster:
}
done:
- /* Try reclaim from full clusters if device is nearfull */
- if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) {
- swap_reclaim_full_clusters(si);
- if (!found && !order && si->pages != si->inuse_pages)
- goto new_cluster;
- }
-
cluster->next[order] = offset;
return found;
}
@@ -922,6 +928,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->lowest_bit = si->max;
si->highest_bit = 0;
del_from_avail_list(si);
+
+ if (vm_swap_full())
+ schedule_work(&si->reclaim_work);
}
}
@@ -2816,6 +2825,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
wait_for_completion(&p->comp);
flush_work(&p->discard_work);
+ flush_work(&p->reclaim_work);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
@@ -3376,6 +3386,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
return PTR_ERR(si);
INIT_WORK(&si->discard_work, swap_discard_work);
+ INIT_WORK(&si->reclaim_work, swap_reclaim_work);
name = getname(specialfile);
if (IS_ERR(name)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c507..ddaaff67642e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
return false;
}
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pte_pfn(pte);
@@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
return -1;
+ if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
+ return -1;
+
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pmd_pfn(pmd);
@@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (WARN_ON_ONCE(pmd_devmap(pmd)))
return -1;
+ if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
+ return -1;
+
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
@@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
{
struct folio *folio;
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- return NULL;
-
folio = pfn_folio(pfn);
if (folio_nid(folio) != pgdat->node_id)
return NULL;
@@ -3394,21 +3405,16 @@ restart:
total++;
walk->mm_stats[MM_LEAF_TOTAL]++;
- pfn = get_pte_pfn(ptent, args->vma, addr);
+ pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
- continue;
- }
-
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
if (!folio)
continue;
- if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+ continue;
young++;
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3474,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
/* don't round down the first address */
addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
- pfn = get_pmd_pfn(pmd[i], vma, addr);
- if (pfn == -1)
+ if (!pmd_present(pmd[i]))
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (!walk->force_scan && should_clear_pmd_young())
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
+ pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+ if (pfn == -1)
+ goto next;
+
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
if (!folio)
goto next;
- if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3546,27 +3556,18 @@ restart:
}
if (pmd_trans_huge(val)) {
- unsigned long pfn = pmd_pfn(val);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
walk->mm_stats[MM_LEAF_TOTAL]++;
- if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
- continue;
- }
-
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- continue;
-
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ if (pfn != -1)
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
- if (!walk->force_scan && should_clear_pmd_young()) {
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm)) {
if (!pmd_young(val))
continue;
@@ -4040,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* the PTE table to the Bloom filter. This forms a feedback loop between the
* eviction and the aging.
*/
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
- int young = 0;
+ int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
@@ -4062,12 +4063,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+ if (!ptep_clear_young_notify(vma, addr, pte))
+ return false;
+
if (spin_is_contended(pvmw->ptl))
- return;
+ return true;
/* exclude special VMAs containing anon pages from COW */
if (vma->vm_flags & VM_SPECIAL)
- return;
+ return true;
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4075,6 +4079,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
start = max(addr & PMD_MASK, vma->vm_start);
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+ if (end - start == PAGE_SIZE)
+ return true;
+
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4088,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
/* folio_update_gen() requires stable folio_memcg() */
if (!mem_cgroup_trylock_pages(memcg))
- return;
+ return true;
arch_enter_lazy_mmu_mode();
@@ -4098,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, vma, addr);
+ pfn = get_pte_pfn(ptent, vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent))
- continue;
-
folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
if (!folio)
continue;
- if (!ptep_test_and_clear_young(vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ if (!ptep_clear_young_notify(vma, addr, pte + i))
+ continue;
young++;
@@ -4140,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+ return true;
}
/******************************************************************************
@@ -5254,11 +5260,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5286,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index fa050d5a48cd..6eb17cc1a06c 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -22,6 +22,7 @@
#include <time.h>
#include <setjmp.h>
#include <signal.h>
+#include <inttypes.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
@@ -391,9 +392,9 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
if (opt_file)
printf("%lx\t", voff);
if (opt_list_cgroup)
- printf("@%llu\t", (unsigned long long)cgroup0);
+ printf("@%" PRIu64 "\t", cgroup0);
if (opt_list_mapcnt)
- printf("%lu\t", mapcnt0);
+ printf("%" PRIu64 "\t", mapcnt0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
@@ -419,9 +420,9 @@ static void show_page(unsigned long voffset, unsigned long offset,
if (opt_file)
printf("%lx\t", voffset);
if (opt_list_cgroup)
- printf("@%llu\t", (unsigned long long)cgroup);
+ printf("@%" PRIu64 "\t", cgroup)
if (opt_list_mapcnt)
- printf("%lu\t", mapcnt);
+ printf("%" PRIu64 "\t", mapcnt);
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
index cfaeaea71042..04e9e6ba86ea 100644
--- a/tools/mm/slabinfo.c
+++ b/tools/mm/slabinfo.c
@@ -1297,7 +1297,9 @@ static void read_slab_dir(void)
slab->cpu_partial_free = get_obj("cpu_partial_free");
slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
slab->deactivate_bypass = get_obj("deactivate_bypass");
- chdir("..");
+ if (chdir(".."))
+ fatal("Unable to chdir from slab ../%s\n",
+ slab->name);
if (slab->name[0] == ':')
alias_targets++;
slab++;