author    Linus Torvalds <torvalds@linux-foundation.org>  2024-11-23 09:58:07 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-11-23 09:58:07 -0800
commit    5c00ff742bf5caf85f60e1c73999f99376fb865d (patch)
tree      fa484e83c27af79f1c0511e7e0673507461c9379 /mm
parent    228a1157fb9fec47eb135b51c0202b574e079ebf (diff)
parent    2532e6c74a67e65b95f310946e0c0e0a41b3a34b (diff)
Merge tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - The series "zram: optimal post-processing target selection" from Sergey Senozhatsky improves zram's post-processing selection algorithm. This leads to improved memory savings.

 - Wei Yang has gone to town on the mapletree code, contributing several series which clean up the implementation:
     - "refine mas_mab_cp()"
     - "Reduce the space to be cleared for maple_big_node"
     - "maple_tree: simplify mas_push_node()"
     - "Following cleanup after introduce mas_wr_store_type()"
     - "refine storing null"

 - The series "selftests/mm: hugetlb_fault_after_madv improvements" from David Hildenbrand fixes this selftest for s390.

 - The series "introduce pte_offset_map_{ro|rw}_nolock()" from Qi Zheng implements some rationalizations and cleanups in the page mapping code.

 - The series "mm: optimize shadow entries removal" from Shakeel Butt optimizes the file truncation code by speeding up the handling of shadow entries.

 - The series "Remove PageKsm()" from Matthew Wilcox completes the migration of this flag over to being a folio-based flag.

 - The series "Unify hugetlb into arch_get_unmapped_area functions" from Oscar Salvador implements a bunch of consolidations and cleanups in the hugetlb code.

 - The series "Do not shatter hugezeropage on wp-fault" from Dev Jain takes away the wp-fault time practice of turning a huge zero page into small pages. Instead we replace the whole thing with a THP. More consistent, cleaner, and potentially saves a large number of pagefaults.

 - The series "percpu: Add a test case and fix for clang" from Andy Shevchenko enhances and fixes the kernel's built-in percpu test code.

 - The series "mm/mremap: Remove extra vma tree walk" from Liam Howlett optimizes mremap() by avoiding doing things which we didn't need to do.

 - The series "Improve the tmpfs large folio read performance" from Baolin Wang teaches tmpfs to copy data into userspace at the folio size rather than as individual pages. A 20% speedup was observed.

 - The series "mm/damon/vaddr: Fix issue in damon_va_evenly_split_region()" from Zheng Yejian fixes DAMON splitting.

 - The series "memcg-v1: fully deprecate charge moving" from Shakeel Butt removes the long-deprecated memcg v1 charge moving feature.

 - The series "fix error handling in mmap_region() and refactor" from Lorenzo Stoakes cleans up some of the mmap() error handling and addresses some potential performance issues.

 - The series "x86/module: use large ROX pages for text allocations" from Mike Rapoport teaches x86 to use large pages for read-only-execute module text.

 - The series "page allocation tag compression" from Suren Baghdasaryan is follow-on maintenance work for the new page allocation profiling feature.

 - The series "page->index removals in mm" from Matthew Wilcox removes most references to page->index in mm/. A slow march towards shrinking struct page.

 - The series "damon/{self,kunit}tests: minor fixups for DAMON debugfs interface tests" from Andrew Paniakin performs maintenance work for DAMON's self-testing code.

 - The series "mm: zswap swap-out of large folios" from Kanchana Sridhar improves zswap's batching of compression and decompression. It is a step along the way towards using Intel IAA hardware acceleration for this zswap operation.

 - The series "kasan: migrate the last module test to kunit" from Sabyrzhan Tasbolatov completes the migration of the KASAN built-in tests over to the KUnit framework.

 - The series "implement lightweight guard pages" from Lorenzo Stoakes permits userspace to place fault-generating guard pages within a single VMA, rather than requiring that multiple VMAs be created for this. Improved efficiencies for userspace memory allocators are expected.

 - The series "memcg: tracepoint for flushing stats" from JP Kobryn uses tracepoints to provide increased visibility into memcg stats flushing activity.

 - The series "zram: IDLE flag handling fixes" from Sergey Senozhatsky fixes a zram buglet which potentially affected performance.

 - The series "mm: add more kernel parameters to control mTHP" from Maíra Canal enhances our ability to control/configure multisize THP from the kernel boot command line.

 - The series "kasan: few improvements on kunit tests" from Sabyrzhan Tasbolatov has a couple of fixups for the KASAN KUnit tests.

 - The series "mm/list_lru: Split list_lru lock into per-cgroup scope" from Kairui Song optimizes list_lru memory utilization when lockdep is enabled.

* tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (215 commits)
  cma: enforce non-zero pageblock_order during cma_init_reserved_mem()
  mm/kfence: add a new kunit test test_use_after_free_read_nofault()
  zram: fix NULL pointer in comp_algorithm_show()
  memcg/hugetlb: add hugeTLB counters to memcg
  vmstat: call fold_vm_zone_numa_events() before show per zone NUMA event
  mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount
  zram: ZRAM_DEF_COMP should depend on ZRAM
  MAINTAINERS/MEMORY MANAGEMENT: add document files for mm
  Docs/mm/damon: recommend academic papers to read and/or cite
  mm: define general function pXd_init()
  kmemleak: iommu/iova: fix transient kmemleak false positive
  mm/list_lru: simplify the list_lru walk callback function
  mm/list_lru: split the lock to per-cgroup scope
  mm/list_lru: simplify reparenting and initial allocation
  mm/list_lru: code clean up for reparenting
  mm/list_lru: don't export list_lru_add
  mm/list_lru: don't pass unnecessary key parameters
  kasan: add kunit tests for kmalloc_track_caller, kmalloc_node_track_caller
  kasan: change kasan_atomics kunit test as KUNIT_CASE_SLOW
  kasan: use EXPORT_SYMBOL_IF_KUNIT to export symbols
  ...
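For context on the "implement lightweight guard pages" series mentioned above, here is a minimal userspace sketch of how such guard regions are expected to be used. It assumes the madvise() advice values MADV_GUARD_INSTALL (102) and MADV_GUARD_REMOVE (103) introduced by that series; the fallback #defines are only needed where libc headers do not yet carry them, and the surrounding allocator logic is purely illustrative:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #ifndef MADV_GUARD_INSTALL          /* values assumed from the guard-pages series */
    #define MADV_GUARD_INSTALL 102
    #define MADV_GUARD_REMOVE  103
    #endif

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            /* One anonymous VMA: a usable region plus a trailing guard page. */
            char *buf = mmap(NULL, 16 * page, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (buf == MAP_FAILED)
                    return 1;

            /*
             * Mark the last page as a guard: touching it generates a fault,
             * but no separate PROT_NONE VMA has to be created for it.
             */
            if (madvise(buf + 15 * page, page, MADV_GUARD_INSTALL))
                    perror("madvise(MADV_GUARD_INSTALL)");

            buf[0] = 1;                 /* normal access to the usable region is fine */
            /* buf[15 * page] = 1;         would fault on the guard page */

            madvise(buf + 15 * page, page, MADV_GUARD_REMOVE);
            munmap(buf, 16 * page);
            return 0;
    }

The point of the series is exactly this pattern: a userspace allocator can fence off red zones inside one large mapping without fragmenting its address space into many small VMAs.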
Diffstat (limited to 'mm')
-rw-r--r-- mm/bootmem_info.c            |  11
-rw-r--r-- mm/cma.c                     |  12
-rw-r--r-- mm/damon/Kconfig             |   2
-rw-r--r-- mm/damon/tests/dbgfs-kunit.h |   2
-rw-r--r-- mm/damon/tests/vaddr-kunit.h |   4
-rw-r--r-- mm/damon/vaddr.c             |   9
-rw-r--r-- mm/execmem.c                 | 352
-rw-r--r-- mm/filemap.c                 |   5
-rw-r--r-- mm/gup.c                     |   8
-rw-r--r-- mm/huge_memory.c             | 227
-rw-r--r-- mm/hugetlb.c                 |  17
-rw-r--r-- mm/internal.h                |  52
-rw-r--r-- mm/kasan/Makefile            |   2
-rw-r--r-- mm/kasan/hw_tags.c           |   7
-rw-r--r-- mm/kasan/init.c              |  12
-rw-r--r-- mm/kasan/kasan.h             |   2
-rw-r--r-- mm/kasan/kasan_test_c.c      | 118
-rw-r--r-- mm/kasan/kasan_test_module.c |  81
-rw-r--r-- mm/kasan/report.c            |  19
-rw-r--r-- mm/kasan/shadow.c            |  14
-rw-r--r-- mm/kfence/kfence_test.c      |  17
-rw-r--r-- mm/khugepaged.c              |  31
-rw-r--r-- mm/kmemleak.c                |  41
-rw-r--r-- mm/kmsan/kmsan_test.c        |  17
-rw-r--r-- mm/ksm.c                     | 110
-rw-r--r-- mm/list_lru.c                | 383
-rw-r--r-- mm/maccess.c                 |  11
-rw-r--r-- mm/madvise.c                 | 298
-rw-r--r-- mm/memcontrol-v1.c           | 983
-rw-r--r-- mm/memcontrol-v1.h           |   6
-rw-r--r-- mm/memcontrol.c              | 210
-rw-r--r-- mm/memory-failure.c          |  32
-rw-r--r-- mm/memory.c                  |  67
-rw-r--r-- mm/memory_hotplug.c          |   2
-rw-r--r-- mm/mempolicy.c               |   7
-rw-r--r-- mm/migrate.c                 |   3
-rw-r--r-- mm/mm_init.c                 |   5
-rw-r--r-- mm/mmap.c                    | 276
-rw-r--r-- mm/mmap_lock.c               |  39
-rw-r--r-- mm/mprotect.c                |   6
-rw-r--r-- mm/mremap.c                  | 104
-rw-r--r-- mm/mseal.c                   |   1
-rw-r--r-- mm/oom_kill.c                |   1
-rw-r--r-- mm/page-writeback.c          |  45
-rw-r--r-- mm/page_alloc.c              |   2
-rw-r--r-- mm/page_io.c                 |  10
-rw-r--r-- mm/page_vma_mapped.c         |  32
-rw-r--r-- mm/pagewalk.c                | 246
-rw-r--r-- mm/percpu.c                  |  11
-rw-r--r-- mm/pgtable-generic.c         |  41
-rw-r--r-- mm/process_vm_access.c       |   4
-rw-r--r-- mm/readahead.c               |  15
-rw-r--r-- mm/rmap.c                    |  45
-rw-r--r-- mm/shmem.c                   | 345
-rw-r--r-- mm/show_mem.c                |   3
-rw-r--r-- mm/sparse-vmemmap.c          |  12
-rw-r--r-- mm/sparse.c                  |  10
-rw-r--r-- mm/swap.c                    |  31
-rw-r--r-- mm/swap_state.c              |   3
-rw-r--r-- mm/truncate.c                | 103
-rw-r--r-- mm/userfaultfd.c             |  17
-rw-r--r-- mm/util.c                    |   2
-rw-r--r-- mm/vma.c                     | 447
-rw-r--r-- mm/vma.h                     |  97
-rw-r--r-- mm/vma_internal.h            |   5
-rw-r--r-- mm/vmalloc.c                 |  52
-rw-r--r-- mm/vmscan.c                  |  68
-rw-r--r-- mm/vmstat.c                  |  28
-rw-r--r-- mm/workingset.c              |  34
-rw-r--r-- mm/zsmalloc.c                |  86
-rw-r--r-- mm/zswap.c                   | 245
71 files changed, 3016 insertions(+), 2629 deletions(-)
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index fa7cb0c87c03..95f288169a38 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -14,23 +14,24 @@
#include <linux/memory_hotplug.h>
#include <linux/kmemleak.h>
-void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+void get_page_bootmem(unsigned long info, struct page *page,
+ enum bootmem_type type)
{
- page->index = type;
+ BUG_ON(type > 0xf);
+ BUG_ON(info > (ULONG_MAX >> 4));
SetPagePrivate(page);
- set_page_private(page, info);
+ set_page_private(page, info << 4 | type);
page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
{
- unsigned long type = page->index;
+ enum bootmem_type type = bootmem_type(page);
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
- page->index = 0;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
diff --git a/mm/cma.c b/mm/cma.c
index 2d9fae939283..de5bc0c81fc2 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,7 +32,7 @@
#include "cma.h"
struct cma cma_areas[MAX_CMA_AREAS];
-unsigned cma_area_count;
+unsigned int cma_area_count;
static DEFINE_MUTEX(cma_mutex);
phys_addr_t cma_get_base(const struct cma *cma)
@@ -135,7 +135,6 @@ out_error:
totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
- return;
}
static int __init cma_init_reserved_areas(void)
@@ -182,6 +181,15 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
+ /*
+ * CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which
+ * needs pageblock_order to be initialized. Let's enforce it.
+ */
+ if (!pageblock_order) {
+ pr_err("pageblock_order not yet initialized. Called during early boot?\n");
+ return -EINVAL;
+ }
+
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 35b72f88983a..d0357f3e9372 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -60,7 +60,7 @@ config DAMON_SYSFS
the interface for arbitrary data access monitoring.
config DAMON_SYSFS_KUNIT_TEST
- bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
+ bool "Test for damon sysfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_SYSFS && KUNIT=y
default KUNIT_ALL_TESTS
help
diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h
index d2ecfcc8db86..087e53f641a8 100644
--- a/mm/damon/tests/dbgfs-kunit.h
+++ b/mm/damon/tests/dbgfs-kunit.h
@@ -168,6 +168,6 @@ static struct kunit_suite damon_test_suite = {
};
kunit_test_suite(damon_test_suite);
-#endif /* _DAMON_TEST_H */
+#endif /* _DAMON_DBGFS_TEST_H */
#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index a339d117150f..b9fe3bc8472b 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -66,7 +66,7 @@ failed:
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
static struct mm_struct mm;
- struct damon_addr_range regions[3] = {0,};
+ struct damon_addr_range regions[3] = {0};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
@@ -300,6 +300,8 @@ static void damon_test_split_evenly(struct kunit *test)
damon_test_split_evenly_fail(test, 0, 100, 0);
damon_test_split_evenly_succ(test, 0, 100, 10);
damon_test_split_evenly_succ(test, 5, 59, 5);
+ damon_test_split_evenly_succ(test, 4, 6, 1);
+ damon_test_split_evenly_succ(test, 0, 3, 2);
damon_test_split_evenly_fail(test, 5, 6, 2);
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 08cfd22b5249..b9eaa20b73b9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -67,10 +67,14 @@ static int damon_va_evenly_split_region(struct damon_target *t,
unsigned long sz_orig, sz_piece, orig_end;
struct damon_region *n = NULL, *next;
unsigned long start;
+ unsigned int i;
if (!r || !nr_pieces)
return -EINVAL;
+ if (nr_pieces == 1)
+ return 0;
+
orig_end = r->ar.end;
sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
@@ -80,8 +84,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
r->ar.end = r->ar.start + sz_piece;
next = damon_next_region(r);
- for (start = r->ar.end; start + sz_piece <= orig_end;
- start += sz_piece) {
+ for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) {
n = damon_new_region(start, start + sz_piece);
if (!n)
return -ENOMEM;
@@ -353,11 +356,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
set_huge_pte_at(mm, addr, pte, entry, psize);
}
-#ifdef CONFIG_MMU_NOTIFIER
if (mmu_notifier_clear_young(mm, addr,
addr + huge_page_size(hstate_vma(vma))))
referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
folio_set_young(folio);
diff --git a/mm/execmem.c b/mm/execmem.c
index 0c4b36bc6d10..317b6a8d35be 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -6,28 +6,41 @@
* Copyright (C) 2024 Mike Rapoport IBM.
*/
+#define pr_fmt(fmt) "execmem: " fmt
+
#include <linux/mm.h>
+#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/execmem.h>
+#include <linux/maple_tree.h>
+#include <linux/set_memory.h>
#include <linux/moduleloader.h>
+#include <linux/text-patching.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
static struct execmem_info *execmem_info __ro_after_init;
static struct execmem_info default_execmem_info __ro_after_init;
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+ pgprot_t pgprot, unsigned long vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
- unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+ unsigned int align = range->alignment;
unsigned long start = range->start;
unsigned long end = range->end;
- unsigned int align = range->alignment;
- pgprot_t pgprot = range->pgprot;
void *p;
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
+ if (vm_flags & VM_ALLOW_HUGE_VMAP)
+ align = PMD_SIZE;
+
p = __vmalloc_node_range(size, align, start, end, gfp_flags,
pgprot, vm_flags, NUMA_NO_NODE,
__builtin_return_address(0));
@@ -40,7 +53,7 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
}
if (!p) {
- pr_warn_ratelimited("execmem: unable to allocate memory\n");
+ pr_warn_ratelimited("unable to allocate memory\n");
return NULL;
}
@@ -49,14 +62,314 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
return NULL;
}
- return kasan_reset_tag(p);
+ return p;
+}
+
+struct vm_struct *execmem_vmap(size_t size)
+{
+ struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
+ struct vm_struct *area;
+
+ area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+ range->start, range->end, NUMA_NO_NODE,
+ GFP_KERNEL, __builtin_return_address(0));
+ if (!area && range->fallback_start)
+ area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+ range->fallback_start, range->fallback_end,
+ NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));
+
+ return area;
+}
+#else
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+ pgprot_t pgprot, unsigned long vm_flags)
+{
+ return vmalloc(size);
+}
+#endif /* CONFIG_MMU */
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+struct execmem_cache {
+ struct mutex mutex;
+ struct maple_tree busy_areas;
+ struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+ .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+ .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+ execmem_cache.mutex),
+ .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+ execmem_cache.mutex),
+};
+
+static inline unsigned long mas_range_len(struct ma_state *mas)
+{
+ return mas->last - mas->index + 1;
+}
+
+static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
+{
+ unsigned int nr = (1 << get_vm_area_page_order(vm));
+ unsigned int updated = 0;
+ int err = 0;
+
+ for (int i = 0; i < vm->nr_pages; i += nr) {
+ err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
+ if (err)
+ goto err_restore;
+ updated += nr;
+ }
+
+ return 0;
+
+err_restore:
+ for (int i = 0; i < updated; i += nr)
+ set_direct_map_valid_noflush(vm->pages[i], nr, !valid);
+
+ return err;
+}
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ MA_STATE(mas, free_areas, 0, ULONG_MAX);
+ void *area;
+
+ mutex_lock(mutex);
+ mas_for_each(&mas, area, ULONG_MAX) {
+ size_t size = mas_range_len(&mas);
+
+ if (IS_ALIGNED(size, PMD_SIZE) &&
+ IS_ALIGNED(mas.index, PMD_SIZE)) {
+ struct vm_struct *vm = find_vm_area(area);
+
+ execmem_set_direct_map_valid(vm, true);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ vfree(area);
+ }
+ }
+ mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, free_areas, addr - 1, addr + 1);
+ unsigned long lower, upper;
+ void *area = NULL;
+ int err;
+
+ lower = addr;
+ upper = addr + size - 1;
+
+ mutex_lock(mutex);
+ area = mas_walk(&mas);
+ if (area && mas.last == addr - 1)
+ lower = mas.index;
+
+ area = mas_next(&mas, ULONG_MAX);
+ if (area && mas.index == addr + size)
+ upper = mas.last;
+
+ mas_set_range(&mas, lower, upper);
+ err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
+ mutex_unlock(mutex);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+ size_t size)
+{
+ unsigned long addr = mas->index;
+
+ if (addr >= range->start && addr + size < range->end)
+ return true;
+
+ if (range->fallback_start &&
+ addr >= range->fallback_start && addr + size < range->fallback_end)
+ return true;
+
+ return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+ MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr, last, area_size = 0;
+ void *area, *ptr = NULL;
+ int err;
+
+ mutex_lock(mutex);
+ mas_for_each(&mas_free, area, ULONG_MAX) {
+ area_size = mas_range_len(&mas_free);
+
+ if (area_size >= size && within_range(range, &mas_free, size))
+ break;
+ }
+
+ if (area_size < size)
+ goto out_unlock;
+
+ addr = mas_free.index;
+ last = mas_free.last;
+
+ /* insert allocated size to busy_areas at range [addr, addr + size) */
+ mas_set_range(&mas_busy, addr, addr + size - 1);
+ err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
+ if (err)
+ goto out_unlock;
+
+ mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
+ if (area_size > size) {
+ void *ptr = (void *)(addr + size);
+
+ /*
+ * re-insert remaining free size to free_areas at range
+ * [addr + size, last]
+ */
+ mas_set_range(&mas_free, addr + size, last);
+ err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
+ if (err) {
+ mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
+ goto out_unlock;
+ }
+ }
+ ptr = (void *)addr;
+
+out_unlock:
+ mutex_unlock(mutex);
+ return ptr;
+}
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+ unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+ unsigned long start, end;
+ struct vm_struct *vm;
+ size_t alloc_size;
+ int err = -ENOMEM;
+ void *p;
+
+ alloc_size = round_up(size, PMD_SIZE);
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p)
+ return err;
+
+ vm = find_vm_area(p);
+ if (!vm)
+ goto err_free_mem;
+
+ /* fill memory with instructions that will trap */
+ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+ start = (unsigned long)p;
+ end = start + alloc_size;
+
+ vunmap_range(start, end);
+
+ err = execmem_set_direct_map_valid(vm, false);
+ if (err)
+ goto err_free_mem;
+
+ err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+ PMD_SHIFT);
+ if (err)
+ goto err_free_mem;
+
+ err = execmem_cache_add(p, alloc_size);
+ if (err)
+ goto err_free_mem;
+
+ return 0;
+
+err_free_mem:
+ vfree(p);
+ return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ void *p;
+ int err;
+
+ p = __execmem_cache_alloc(range, size);
+ if (p)
+ return p;
+
+ err = execmem_cache_populate(range, size);
+ if (err)
+ return NULL;
+
+ return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ size_t size;
+ void *area;
+
+ mutex_lock(mutex);
+ area = mas_walk(&mas);
+ if (!area) {
+ mutex_unlock(mutex);
+ return false;
+ }
+ size = mas_range_len(&mas);
+
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ mutex_unlock(mutex);
+
+ execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+
+ execmem_cache_add(ptr, size);
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
+}
+#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ return NULL;
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+ return false;
}
+#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */
void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
+ bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+ unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+ pgprot_t pgprot = range->pgprot;
+ void *p;
- return __execmem_alloc(range, size);
+ if (use_cache)
+ p = execmem_cache_alloc(range, size);
+ else
+ p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
+ return kasan_reset_tag(p);
}
void execmem_free(void *ptr)
@@ -66,7 +379,19 @@ void execmem_free(void *ptr)
* supported by vmalloc.
*/
WARN_ON(in_interrupt());
- vfree(ptr);
+
+ if (!execmem_cache_free(ptr))
+ vfree(ptr);
+}
+
+void *execmem_update_copy(void *dst, const void *src, size_t size)
+{
+ return text_poke_copy(dst, src, size);
+}
+
+bool execmem_is_rox(enum execmem_type type)
+{
+ return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
}
static bool execmem_validate(struct execmem_info *info)
@@ -78,6 +403,17 @@ static bool execmem_validate(struct execmem_info *info)
return false;
}
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
+ for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
+ r = &info->ranges[i];
+
+ if (r->flags & EXECMEM_ROX_CACHE) {
+ pr_warn_once("ROX cache is not supported\n");
+ r->flags &= ~EXECMEM_ROX_CACHE;
+ }
+ }
+ }
+
return true;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 196779e8e396..7c76a123ba18 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,7 +119,6 @@
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
* ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
- * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
@@ -3260,8 +3259,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;
- ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
+ ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;
diff --git a/mm/gup.c b/mm/gup.c
index 2b84eead9baa..746070a1d8bf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -922,14 +922,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
+ !pte_dirty(pte) && !folio_test_dirty(folio))
+ folio_mark_dirty(folio);
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
- * mark_page_accessed().
+ * folio_mark_accessed().
*/
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
}
out:
pte_unmap_unlock(ptep, ptl);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5734d5d5060f..ee335d96fc39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
@@ -84,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return false;
+
+ if (!vma->vm_file)
+ return false;
+
+ inode = file_inode(vma->vm_file);
+
+ return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
@@ -601,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -619,6 +635,8 @@ static struct attribute *anon_stats_attrs[] = {
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -649,6 +667,8 @@ static struct attribute_group file_stats_attr_grp = {
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -938,26 +958,6 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline int get_order_from_str(const char *size_str)
-{
- unsigned long size;
- char *endptr;
- int order;
-
- size = memparse(size_str, &endptr);
-
- if (!is_power_of_2(size))
- goto err;
- order = get_order(size);
- if (BIT(order) & ~THP_ORDERS_ALL_ANON)
- goto err;
-
- return order;
-err:
- pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
- return -EINVAL;
-}
-
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
@@ -969,7 +969,7 @@ static int __init setup_thp_anon(char *str)
if (!str || strlen(str) + 1 > PAGE_SIZE)
goto err;
- strcpy(str_dup, str);
+ strscpy(str_dup, str);
always = huge_anon_orders_always;
madvise = huge_anon_orders_madvise;
@@ -987,10 +987,22 @@ static int __init setup_thp_anon(char *str)
start_size = strsep(&subtoken, "-");
end_size = subtoken;
- start = get_order_from_str(start_size);
- end = get_order_from_str(end_size);
+ start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
+ end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
} else {
- start = end = get_order_from_str(subtoken);
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_ANON);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
+ goto err;
}
if (start < 0 || end < 0 || start > end)
@@ -1137,47 +1149,87 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
- struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio = page_folio(page);
- pgtable_t pgtable;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- vm_fault_t ret = 0;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ const int order = HPAGE_PMD_ORDER;
+ struct folio *folio;
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
+ if (unlikely(!folio)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- return VM_FAULT_FALLBACK;
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ return NULL;
}
folio_throttle_swaprate(folio, gfp);
- pgtable = pte_alloc_one(vma->vm_mm);
- if (unlikely(!pgtable)) {
- ret = VM_FAULT_OOM;
- goto release;
- }
-
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used),
+ * folio_zero_user() is used to make sure that the page corresponding
+ * to the faulting address will be hot in the cache after zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* folio_zero_user writes become visible before the set_pmd_at()
* write.
*/
__folio_mark_uptodate(folio);
+ return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ pmd_t entry;
+
+ entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ pgtable_t pgtable;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
goto unlock_release;
} else {
- pmd_t entry;
-
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_release;
@@ -1191,21 +1243,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
-
- entry = mk_huge_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
- count_vm_event(THP_FAULT_ALLOC);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
- count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -1272,8 +1314,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- gfp_t gfp;
- struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret;
@@ -1324,14 +1364,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
return ret;
}
- gfp = vma_thp_gfp_mask(vma);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
- if (unlikely(!folio)) {
- count_vm_event(THP_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+ return __do_huge_pmd_anonymous_page(vmf);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1763,6 +1797,38 @@ unlock:
spin_unlock(vmf->ptl);
}
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ struct folio *folio;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+ goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ goto unlock;
+release:
+ folio_put(folio);
+unlock:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ return ret;
+}
+
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1775,8 +1841,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
+ vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+
+ /* Fallback to splitting PMD if THP cannot be allocated */
goto fallback;
+ }
spin_lock(vmf->ptl);
@@ -3124,8 +3197,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
/* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
- page_tail->mapping = head->mapping;
- page_tail->index = head->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + tail;
/*
* page->private should not be set in tail pages. Fix up and warn once
@@ -3201,11 +3274,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+ struct folio *tail;
__split_huge_page_tail(folio, i, lruvec, list, new_order);
+ tail = page_folio(head + i);
/* Some pages can be beyond EOF: drop them from page cache */
- if (head[i].index >= end) {
- struct folio *tail = page_folio(head + i);
-
+ if (tail->index >= end) {
if (shmem_mapping(folio->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
@@ -3213,12 +3286,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
folio_put(tail);
- } else if (!PageAnon(page)) {
- __xa_store(&folio->mapping->i_pages, head[i].index,
- head + i, 0);
+ } else if (!folio_test_anon(folio)) {
+ __xa_store(&folio->mapping->i_pages, tail->index,
+ tail, 0);
} else if (swap_cache) {
__xa_store(&swap_cache->i_pages, offset + i,
- head + i, 0);
+ tail, 0);
}
}
@@ -4096,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strcpy(file_path, tok);
+ strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 190fa05635f4..ea2ed8e301ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1925,6 +1925,7 @@ void free_huge_folio(struct folio *folio)
pages_per_huge_page(h), folio);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
+ lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
mem_cgroup_uncharge(folio);
if (restore_reserve)
h->resv_huge_pages++;
@@ -3093,6 +3094,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
if (!memcg_charge_ret)
mem_cgroup_commit_charge(folio, memcg);
+ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
mem_cgroup_put(memcg);
return folio;
@@ -3301,6 +3303,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
char buf[32];
+ LIST_HEAD(folio_list);
for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
if (hstate_is_gigantic(h)) {
@@ -3310,14 +3313,18 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
struct folio *folio;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
- &node_states[N_MEMORY]);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
if (!folio)
break;
- free_huge_folio(folio); /* free it into the hugepage allocator */
+ list_add(&folio->lru, &folio_list);
}
cond_resched();
}
+
+ if (!list_empty(&folio_list))
+ prep_and_add_allocated_folios(h, &folio_list);
+
if (i == h->max_huge_pages_node[nid])
return;
@@ -6348,6 +6355,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+ /* This isn't supported in hugetlb. */
+ ret = VM_FAULT_SIGSEGV;
+ goto out_mutex;
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 64c2eb0b160e..5a7302baeed7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
+#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -841,7 +842,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
}
/* mm/util.c */
-struct anon_vma *folio_anon_vma(struct folio *folio);
+struct anon_vma *folio_anon_vma(const struct folio *folio);
#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
@@ -959,7 +960,7 @@ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
* If any page in this range is mapped by this VMA, return the first address
* where any of these pages appear. Otherwise, return -EFAULT.
*/
-static inline unsigned long vma_address(struct vm_area_struct *vma,
+static inline unsigned long vma_address(const struct vm_area_struct *vma,
pgoff_t pgoff, unsigned long nr_pages)
{
unsigned long address;
@@ -1117,10 +1118,11 @@ void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long ksm_addr);
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(const struct page *page,
+ struct vm_area_struct *vma);
#else
static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
@@ -1234,6 +1236,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift);
+unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
static inline void vmalloc_init(void)
{
@@ -1262,6 +1265,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);
+struct vm_struct *__get_vm_area_node(unsigned long size,
+ unsigned long align, unsigned long shift,
+ unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask,
+ const void *caller);
+
/*
* mm/gup.c
*/
@@ -1276,6 +1285,34 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
+static inline bool alloc_zeroed(void)
+{
+ return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc);
+}
+
+/*
+ * Parses a string with mem suffixes into its order. Useful to parse kernel
+ * parameters.
+ */
+static inline int get_order_from_str(const char *size_str,
+ unsigned long valid_orders)
+{
+ unsigned long size;
+ char *endptr;
+ int order;
+
+ size = memparse(size_str, &endptr);
+
+ if (!is_power_of_2(size))
+ return -EINVAL;
+ order = get_order(size);
+ if (BIT(order) & ~valid_orders)
+ return -EINVAL;
+
+ return order;
+}
+
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -1356,7 +1393,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
smp_rmb();
/*
- * Note that PageKsm() pages cannot be exclusive, and consequently,
+ * Note that KSM pages cannot be exclusive, and consequently,
* cannot get pinned.
*/
return !PageAnonExclusive(page);
@@ -1488,4 +1525,9 @@ static inline void accept_page(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+/* pagewalk.c */
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index b88543e5c0cc..1a958e7c8a46 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -46,7 +46,6 @@ endif
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
-CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
@@ -59,4 +58,3 @@ ifdef CONFIG_RUST
endif
obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o
-obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9958ebc15d38..ccd66c7a4081 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "kasan: " fmt
+#include <kunit/visibility.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
@@ -394,12 +395,12 @@ void kasan_enable_hw_tags(void)
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-EXPORT_SYMBOL_GPL(kasan_enable_hw_tags);
+EXPORT_SYMBOL_IF_KUNIT(kasan_enable_hw_tags);
-void kasan_force_async_fault(void)
+VISIBLE_IF_KUNIT void kasan_force_async_fault(void)
{
hw_force_async_tag_fault();
}
-EXPORT_SYMBOL_GPL(kasan_force_async_fault);
+EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault);
#endif
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ac607c306292..ced6b29fcf76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -106,10 +106,6 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
}
}
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
unsigned long end)
{
@@ -145,10 +141,6 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
return 0;
}
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
@@ -187,10 +179,6 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
return 0;
}
-void __weak __meminit pud_init(void *addr)
-{
-}
-
static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f438a6cdc964..b7e4b81421b3 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -568,7 +568,7 @@ static inline void kasan_kunit_test_suite_end(void) { }
#endif /* CONFIG_KASAN_KUNIT_TEST */
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
bool kasan_save_enable_multi_shot(void);
void kasan_restore_multi_shot(bool enabled);
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index d8fb281e439d..e0ec5a6d15be 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -33,6 +33,8 @@
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+
static bool multishot;
/* Fields set based on lines observed in the console. */
@@ -213,6 +215,36 @@ static void kmalloc_node_oob_right(struct kunit *test)
kfree(ptr);
}
+static void kmalloc_track_caller_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ /*
+ * Check that KASAN detects out-of-bounds access for object allocated via
+ * kmalloc_track_caller().
+ */
+ ptr = kmalloc_track_caller(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y');
+
+ kfree(ptr);
+
+ /*
+ * Check that KASAN detects out-of-bounds access for object allocated via
+ * kmalloc_node_track_caller().
+ */
+ ptr = kmalloc_node_track_caller(size, GFP_KERNEL, 0);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y');
+
+ kfree(ptr);
+}
+
/*
* Check that KASAN detects an out-of-bounds access for a big object allocated
* via kmalloc(). But not as big as to trigger the page_alloc fallback.
@@ -1928,10 +1960,92 @@ static void rust_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
}
+static void copy_to_kernel_nofault_oob(struct kunit *test)
+{
+ char *ptr;
+ char buf[128];
+ size_t size = sizeof(buf);
+
+ /*
+ * This test currently fails with the HW_TAGS mode. The reason is
+ * unknown and needs to be investigated.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+ ptr = kmalloc(size - KASAN_GRANULE_SIZE, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ /*
+ * We test copy_to_kernel_nofault() to detect corrupted memory that is
+ * being written into the kernel. In contrast,
+ * copy_from_kernel_nofault() is primarily used in kernel helper
+ * functions where the source address might be random or uninitialized.
+ * Applying KASAN instrumentation to copy_from_kernel_nofault() could
+ * lead to false positives. By focusing KASAN checks only on
+ * copy_to_kernel_nofault(), we ensure that only valid memory is
+ * written to the kernel, minimizing the risk of kernel corruption
+ * while avoiding false positives in the reverse case.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ copy_to_kernel_nofault(&buf[0], ptr, size));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ copy_to_kernel_nofault(ptr, &buf[0], size));
+
+ kfree(ptr);
+}
+
+static void copy_user_test_oob(struct kunit *test)
+{
+ char *kmem;
+ char __user *usermem;
+ unsigned long useraddr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+ int __maybe_unused unused;
+
+ kmem = kunit_kmalloc(test, size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, kmem);
+
+ useraddr = kunit_vm_mmap(test, NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ KUNIT_ASSERT_NE_MSG(test, useraddr, 0,
+ "Could not create userspace mm");
+ KUNIT_ASSERT_LT_MSG(test, useraddr, (unsigned long)TASK_SIZE,
+ "Failed to allocate user memory");
+
+ OPTIMIZER_HIDE_VAR(size);
+ usermem = (char __user *)useraddr;
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = copy_from_user(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = copy_to_user(usermem, kmem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_from_user(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_to_user(usermem, kmem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1));
+
+ /*
+ * Prepare a long string in usermem to avoid the strncpy_from_user test
+ * bailing out on '\0' before it reaches out-of-bounds.
+ */
+ memset(kmem, 'a', size);
+ KUNIT_EXPECT_EQ(test, copy_to_user(usermem, kmem, size), 0);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = strncpy_from_user(kmem, usermem, size + 1));
+}
+
static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmalloc_oob_right),
KUNIT_CASE(kmalloc_oob_left),
KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_track_caller_oob_right),
KUNIT_CASE(kmalloc_big_oob_right),
KUNIT_CASE(kmalloc_large_oob_right),
KUNIT_CASE(kmalloc_large_uaf),
@@ -1992,7 +2106,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_strings),
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
- KUNIT_CASE(kasan_atomics),
+ KUNIT_CASE_SLOW(kasan_atomics),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
@@ -2000,7 +2114,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
+ KUNIT_CASE(copy_to_kernel_nofault_oob),
KUNIT_CASE(rust_uaf),
+ KUNIT_CASE(copy_user_test_oob),
{}
};
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
deleted file mode 100644
index 27ec22767e42..000000000000
--- a/mm/kasan/kasan_test_module.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
- */
-
-#define pr_fmt(fmt) "kasan: test: " fmt
-
-#include <linux/mman.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "kasan.h"
-
-static noinline void __init copy_user_test(void)
-{
- char *kmem;
- char __user *usermem;
- size_t size = 128 - KASAN_GRANULE_SIZE;
- int __maybe_unused unused;
-
- kmem = kmalloc(size, GFP_KERNEL);
- if (!kmem)
- return;
-
- usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_ANONYMOUS | MAP_PRIVATE, 0);
- if (IS_ERR(usermem)) {
- pr_err("Failed to allocate user memory\n");
- kfree(kmem);
- return;
- }
-
- OPTIMIZER_HIDE_VAR(size);
-
- pr_info("out-of-bounds in copy_from_user()\n");
- unused = copy_from_user(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in copy_to_user()\n");
- unused = copy_to_user(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in __copy_from_user()\n");
- unused = __copy_from_user(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in __copy_to_user()\n");
- unused = __copy_to_user(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
- unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
- unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in strncpy_from_user()\n");
- unused = strncpy_from_user(kmem, usermem, size + 1);
-
- vm_munmap((unsigned long)usermem, PAGE_SIZE);
- kfree(kmem);
-}
-
-static int __init kasan_test_module_init(void)
-{
- /*
- * Temporarily enable multi-shot mode. Otherwise, KASAN would only
- * report the first detected bug and panic the kernel if panic_on_warn
- * is enabled.
- */
- bool multishot = kasan_save_enable_multi_shot();
-
- copy_user_test();
-
- kasan_restore_multi_shot(multishot);
- return -EAGAIN;
-}
-
-module_init(kasan_test_module_init);
-MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b48c768acc84..50fb19ad4388 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -10,6 +10,7 @@
*/
#include <kunit/test.h>
+#include <kunit/visibility.h>
#include <linux/bitops.h>
#include <linux/ftrace.h>
#include <linux/init.h>
@@ -132,20 +133,20 @@ static bool report_enabled(void)
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-bool kasan_save_enable_multi_shot(void)
+VISIBLE_IF_KUNIT bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
-EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+EXPORT_SYMBOL_IF_KUNIT(kasan_save_enable_multi_shot);
-void kasan_restore_multi_shot(bool enabled)
+VISIBLE_IF_KUNIT void kasan_restore_multi_shot(bool enabled)
{
if (!enabled)
clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
-EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+EXPORT_SYMBOL_IF_KUNIT(kasan_restore_multi_shot);
#endif
@@ -157,17 +158,17 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
*/
static bool kasan_kunit_executing;
-void kasan_kunit_test_suite_start(void)
+VISIBLE_IF_KUNIT void kasan_kunit_test_suite_start(void)
{
WRITE_ONCE(kasan_kunit_executing, true);
}
-EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start);
+EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_start);
-void kasan_kunit_test_suite_end(void)
+VISIBLE_IF_KUNIT void kasan_kunit_test_suite_end(void)
{
WRITE_ONCE(kasan_kunit_executing, false);
}
-EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end);
+EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_end);
static bool kasan_kunit_test_suite_executing(void)
{
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
*/
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end)
+ unsigned long free_region_end,
+ unsigned long flags)
{
void *shadow_start, *shadow_end;
unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
return;
}
- apply_to_existing_page_range(&init_mm,
+
+
+ if (flags & KASAN_VMALLOC_PAGE_RANGE)
+ apply_to_existing_page_range(&init_mm,
(unsigned long)shadow_start,
size, kasan_depopulate_vmalloc_pte,
NULL);
- flush_tlb_kernel_range((unsigned long)shadow_start,
- (unsigned long)shadow_end);
+
+ if (flags & KASAN_VMALLOC_TLB_FLUSH)
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
}
}
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 00fd17285285..f65fb182466d 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -383,6 +383,22 @@ static void test_use_after_free_read(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+static void test_use_after_free_read_nofault(struct kunit *test)
+{
+ const size_t size = 32;
+ char *addr;
+ char dst;
+ int ret;
+
+ setup_test_cache(test, size, 0, NULL);
+ addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(addr);
+ /* Use after free with *_nofault() */
+ ret = copy_from_kernel_nofault(&dst, addr, 1);
+ KUNIT_EXPECT_EQ(test, ret, -EFAULT);
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
static void test_double_free(struct kunit *test)
{
const size_t size = 32;
@@ -780,6 +796,7 @@ static struct kunit_case kfence_test_cases[] = {
KFENCE_KUNIT_CASE(test_out_of_bounds_read),
KFENCE_KUNIT_CASE(test_out_of_bounds_write),
KFENCE_KUNIT_CASE(test_use_after_free_read),
+ KFENCE_KUNIT_CASE(test_use_after_free_read_nofault),
KFENCE_KUNIT_CASE(test_double_free),
KFENCE_KUNIT_CASE(test_invalid_addr_free),
KFENCE_KUNIT_CASE(test_corruption),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b538c3d48386..6f8d46d107b4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -416,9 +415,11 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
static bool hugepage_pmd_enabled(void)
{
/*
- * We cover both the anon and the file-backed case here; file-backed
+ * We cover the anon, shmem and file-backed cases here; file-backed
* hugepages, when configured in, are determined by the global control.
* Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Shmem pmd-sized hugepages are also determined by the pmd-size control,
+ * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
@@ -430,6 +431,8 @@ static bool hugepage_pmd_enabled(void)
if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
+ if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
+ return true;
return false;
}
@@ -1011,7 +1014,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
};
if (!pte++) {
- pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ /*
+ * Here the ptl is only used to check pte_same() in
+ * do_swap_page(), so the read-only version is enough.
+ */
+ pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1601,7 +1608,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1609,6 +1616,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto abort;
+
/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1641,7 +1651,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
nr_ptes++;
}
- pte_unmap(start_pte);
if (!pml)
spin_unlock(ptl);
@@ -1654,14 +1663,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove empty page table */
if (!pml) {
pml = pmd_lock(mm, pmd);
- if (ptl != pml)
+ if (ptl != pml) {
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+ flush_tlb_mm(mm);
+ goto unlock;
+ }
+ }
}
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
+ pte_unmap_unlock(start_pte, ptl);
if (ptl != pml)
- spin_unlock(ptl);
- spin_unlock(pml);
+ spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
@@ -1681,6 +1695,7 @@ abort:
folio_ref_sub(folio, nr_ptes);
add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
+unlock:
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
if (pml && pml != ptl)
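A minimal sketch (assumed, not from the patch) of the locking pattern the hunks above switch to, following the pte_offset_map_{ro|rw}_nolock() semantics: the pmd value is sampled before the PTE lock is held, so it must be revalidated with pmd_same() once the lock is actually taken.

	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return SCAN_PMD_NULL;
	spin_lock(ptl);
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		/* Raced with a collapse or zap; bail out and retry. */
		pte_unmap_unlock(pte, ptl);
		return SCAN_FAIL;
	}
	/* The PTEs may now be modified safely under ptl. */
	pte_unmap_unlock(pte, ptl);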
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0400f5e8ac60..2a945c07ae99 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -935,6 +935,28 @@ static void make_black_object(unsigned long ptr, unsigned int objflags)
}
/*
+ * Reset the checksum of an object. The immediate effect is that it will not
+ * be reported as a leak during the next scan until its checksum is updated.
+ */
+static void reset_checksum(unsigned long ptr)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->checksum = 0;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
* Add a scanning area to the object. If at least one such area is added,
* kmemleak will only scan these ranges rather than the whole memory block.
*/
@@ -1011,7 +1033,7 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
}
/*
- * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
* pointer. Such object will not be scanned by kmemleak but references to it
* are searched.
*/
@@ -1203,6 +1225,23 @@ void __ref kmemleak_not_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_not_leak);
/**
+ * kmemleak_transient_leak - mark an allocated object as a transient false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to not be
+ * reported as a leak temporarily. This may happen, for example, if the object
+ * is part of a singly linked list and the ->next reference to it is changed.
+ */
+void __ref kmemleak_transient_leak(const void *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ reset_checksum((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_transient_leak);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
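A hedged usage sketch of the new hint (struct foo and pop_from_list() are hypothetical): the typical caller temporarily drops the only reference kmemleak can scan, for example while relinking a singly linked list, and does not want the next scan to flag the object.

	struct foo *obj;

	obj = pop_from_list();		/* hypothetical: drops the only scannable reference */
	kmemleak_transient_leak(obj);	/* suppress a one-scan false positive */
	/* ... obj is reinserted into a scannable structure shortly after ... */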
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 13236d579eba..9733a22c46c1 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -640,6 +640,22 @@ static void test_unpoison_memory(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+static void test_copy_from_kernel_nofault(struct kunit *test)
+{
+ long ret;
+ char buf[4], src[4];
+ size_t size = sizeof(buf);
+
+ EXPECTATION_UNINIT_VALUE_FN(expect, "copy_from_kernel_nofault");
+ kunit_info(
+ test,
+ "testing copy_from_kernel_nofault with uninitialized memory\n");
+
+ ret = copy_from_kernel_nofault((char *)&buf[0], (char *)&src[0], size);
+ USE(ret);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_uninit_kmalloc),
KUNIT_CASE(test_init_kmalloc),
@@ -664,6 +680,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
+ KUNIT_CASE(test_copy_from_kernel_nofault),
{},
};
diff --git a/mm/ksm.c b/mm/ksm.c
index a2e2a521df0a..31a9bc365437 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -20,7 +20,6 @@
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cputime.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
@@ -657,7 +656,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
- * okay, that truncation will have unmapped the PageKsm for us.
+ * okay, that truncation will have unmapped the KSM page for us.
*
* VM_FAULT_OOM: at the time of writing (late July 2009), setting
* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
@@ -1052,7 +1051,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
-static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
+static inline
+struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
@@ -1257,7 +1257,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
if (WARN_ON_ONCE(folio_test_large(folio)))
return err;
- pvmw.address = page_address_in_vma(&folio->page, vma);
+ pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
if (pvmw.address == -EFAULT)
goto out;
@@ -1341,7 +1341,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
{
struct folio *kfolio = page_folio(kpage);
struct mm_struct *mm = vma->vm_mm;
- struct folio *folio;
+ struct folio *folio = page_folio(page);
pmd_t *pmd;
pmd_t pmde;
pte_t *ptep;
@@ -1351,7 +1351,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
int err = -EFAULT;
struct mmu_notifier_range range;
- addr = page_address_in_vma(page, vma);
+ addr = page_address_in_vma(folio, page, vma);
if (addr == -EFAULT)
goto out;
@@ -1417,7 +1417,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at(mm, addr, ptep, newpte);
- folio = page_folio(page);
folio_remove_rmap_pte(folio, page, vma);
if (!folio_mapped(folio))
folio_free_swap(folio);
@@ -1435,7 +1434,7 @@ out:
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
- * @kpage: the PageKsm page that we want to map instead of page,
+ * @kpage: the KSM page that we want to map instead of page,
* or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
@@ -1443,28 +1442,29 @@ out:
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
+ struct folio *folio = page_folio(page);
pte_t orig_pte = __pte(0);
int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
goto out;
/*
* We need the folio lock to read a stable swapcache flag in
- * write_protect_page(). We use trylock_page() instead of
- * lock_page() because we don't want to wait here - we
- * prefer to continue scanning and merging different pages,
- * then come back to this page when it is unlocked.
+ * write_protect_page(). We trylock because we don't want to wait
+ * here - we prefer to continue scanning and merging different
+ * pages, then come back to this page when it is unlocked.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
- if (PageTransCompound(page)) {
+ if (folio_test_large(folio)) {
if (split_huge_page(page))
goto out_unlock;
+ folio = page_folio(page);
}
/*
@@ -1473,28 +1473,28 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) {
+ if (write_protect_page(vma, folio, &orig_pte) == 0) {
if (!kpage) {
/*
- * While we hold page lock, upgrade page from
- * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * While we hold folio lock, upgrade folio from
+ * anon to a NULL stable_node with the KSM flag set:
* stable_tree_insert() will update stable_node.
*/
- folio_set_stable_node(page_folio(page), NULL);
- mark_page_accessed(page);
+ folio_set_stable_node(folio, NULL);
+ folio_mark_accessed(folio);
/*
- * Page reclaim just frees a clean page with no dirty
+ * Page reclaim just frees a clean folio with no dirty
* ptes: make sure that the ksm page would be swapped.
*/
- if (!PageDirty(page))
- SetPageDirty(page);
+ if (!folio_test_dirty(folio))
+ folio_mark_dirty(folio);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
}
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
out:
return err;
}
@@ -1582,7 +1582,7 @@ out:
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
+static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
struct page *page,
struct ksm_rmap_item *tree_rmap_item,
struct page *tree_page)
@@ -1600,7 +1600,7 @@ static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
if (err)
break_cow(rmap_item);
}
- return err ? NULL : page;
+ return err ? NULL : page_folio(page);
}
static __always_inline
@@ -1787,9 +1787,9 @@ static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
* with identical content to the page that we are scanning right now.
*
* This function returns the stable tree node of identical content if found,
- * NULL otherwise.
+ * -EBUSY if the stable node's page is being migrated, NULL otherwise.
*/
-static struct page *stable_tree_search(struct page *page)
+static struct folio *stable_tree_search(struct page *page)
{
int nid;
struct rb_root *root;
@@ -1804,7 +1804,7 @@ static struct page *stable_tree_search(struct page *page)
if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
folio_get(folio);
- return &folio->page;
+ return folio;
}
nid = get_kpfn_nid(folio_pfn(folio));
@@ -1899,7 +1899,7 @@ again:
folio_put(tree_folio);
goto replace;
}
- return &tree_folio->page;
+ return tree_folio;
}
}
@@ -1913,7 +1913,7 @@ again:
out:
if (is_page_sharing_candidate(page_node)) {
folio_get(folio);
- return &folio->page;
+ return folio;
} else
return NULL;
@@ -1963,7 +1963,7 @@ replace:
}
stable_node_dup->head = &migrate_nodes;
list_add(&stable_node_dup->list, stable_node_dup->head);
- return &folio->page;
+ return folio;
chain_append:
/*
@@ -2217,7 +2217,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct ksm_stable_node *stable_node;
- struct page *kpage;
+ struct folio *kfolio;
unsigned int checksum;
int err;
bool max_page_sharing_bypass = false;
@@ -2259,31 +2259,31 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
return;
}
- /* We first start with searching the page inside the stable tree */
- kpage = stable_tree_search(page);
- if (kpage == page && rmap_item->head == stable_node) {
- put_page(kpage);
+ /* Start by searching for the folio in the stable tree */
+ kfolio = stable_tree_search(page);
+ if (&kfolio->page == page && rmap_item->head == stable_node) {
+ folio_put(kfolio);
return;
}
remove_rmap_item_from_tree(rmap_item);
- if (kpage) {
- if (PTR_ERR(kpage) == -EBUSY)
+ if (kfolio) {
+ if (kfolio == ERR_PTR(-EBUSY))
return;
- err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+ err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page);
if (!err) {
/*
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
- lock_page(kpage);
- stable_tree_append(rmap_item, page_stable_node(kpage),
+ folio_lock(kfolio);
+ stable_tree_append(rmap_item, folio_stable_node(kfolio),
max_page_sharing_bypass);
- unlock_page(kpage);
+ folio_unlock(kfolio);
}
- put_page(kpage);
+ folio_put(kfolio);
return;
}
@@ -2292,7 +2292,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
if (tree_rmap_item) {
bool split;
- kpage = try_to_merge_two_pages(rmap_item, page,
+ kfolio = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
/*
* If both pages we tried to merge belong to the same compound
@@ -2307,20 +2307,20 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
split = PageTransCompound(page)
&& compound_head(page) == compound_head(tree_page);
put_page(tree_page);
- if (kpage) {
+ if (kfolio) {
/*
* The pages were successfully merged: insert new
* node in the stable tree and add both rmap_items.
*/
- lock_page(kpage);
- stable_node = stable_tree_insert(page_folio(kpage));
+ folio_lock(kfolio);
+ stable_node = stable_tree_insert(kfolio);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node,
false);
stable_tree_append(rmap_item, stable_node,
false);
}
- unlock_page(kpage);
+ folio_unlock(kfolio);
/*
* If we fail to insert the page into the stable tree,
@@ -2401,10 +2401,10 @@ static unsigned int skip_age(rmap_age_t age)
/*
* Determines if a page should be skipped for the current scan.
*
- * @page: page to check
+ * @folio: folio containing the page to check
* @rmap_item: associated rmap_item of page
*/
-static bool should_skip_rmap_item(struct page *page,
+static bool should_skip_rmap_item(struct folio *folio,
struct ksm_rmap_item *rmap_item)
{
rmap_age_t age;
@@ -2417,7 +2417,7 @@ static bool should_skip_rmap_item(struct page *page,
* will essentially ignore them, but we still have to process them
* properly.
*/
- if (PageKsm(page))
+ if (folio_test_ksm(folio))
return false;
age = rmap_item->age;
@@ -2560,7 +2560,7 @@ next_mm:
ksm_scan.rmap_list =
&rmap_item->rmap_list;
- if (should_skip_rmap_item(tmp_page, rmap_item)) {
+ if (should_skip_rmap_item(folio, rmap_item)) {
folio_put(folio);
goto next_page;
}
@@ -2970,7 +2970,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
if (!folio_test_uptodate(folio))
return folio; /* let do_swap_page report the error */
- new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
if (new_folio &&
mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
folio_put(new_folio);
@@ -3067,7 +3067,7 @@ again:
/*
 * Collect processes when the error hits a KSM page.
*/
-void collect_procs_ksm(struct folio *folio, struct page *page,
+void collect_procs_ksm(const struct folio *folio, const struct page *page,
struct list_head *to_kill, int force_early)
{
struct ksm_stable_node *stable_node;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9b7ff06e9d32..f93ada6a207b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -59,6 +59,53 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
}
return &lru->node[nid].lru;
}
+
+static inline struct list_lru_one *
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ bool irq, bool skip_empty)
+{
+ struct list_lru_one *l;
+ long nr_items;
+
+ rcu_read_lock();
+again:
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ if (likely(l)) {
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+ nr_items = READ_ONCE(l->nr_items);
+ if (likely(nr_items != LONG_MIN)) {
+ WARN_ON(nr_items < 0);
+ rcu_read_unlock();
+ return l;
+ }
+ if (irq)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+ }
+ /*
+ * The caller may simply bail out if it raced with reparenting, or it
+ * may iterate through the list_lru and expect empty slots.
+ */
+ if (skip_empty) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ VM_WARN_ON(!css_is_dying(&memcg->css));
+ memcg = parent_mem_cgroup(memcg);
+ goto again;
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+ if (irq_off)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+}
#else
static void list_lru_register(struct list_lru *lru)
{
@@ -83,30 +130,52 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
return &lru->node[nid].lru;
}
+
+static inline struct list_lru_one *
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ bool irq, bool skip_empty)
+{
+ struct list_lru_one *l = &lru->node[nid].lru;
+
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+
+ return l;
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+ if (irq_off)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+}
#endif /* CONFIG_MEMCG */
/* The caller must ensure the memcg lifetime. */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
- spin_lock(&nlru->lock);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+ if (!l)
+ return false;
if (list_empty(item)) {
- l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- nlru->nr_items++;
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
+ atomic_long_inc(&nlru->nr_items);
return true;
}
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
return false;
}
-EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
@@ -127,24 +196,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj);
/* The caller must ensure the memcg lifetime. */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
-
- spin_lock(&nlru->lock);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+ if (!l)
+ return false;
if (!list_empty(item)) {
- l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
list_del_init(item);
l->nr_items--;
- nlru->nr_items--;
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
+ atomic_long_dec(&nlru->nr_items);
return true;
}
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
return false;
}
-EXPORT_SYMBOL_GPL(list_lru_del);
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
{
@@ -201,25 +269,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
struct list_lru_node *nlru;
nlru = &lru->node[nid];
- return nlru->nr_items;
+ return atomic_long_read(&nlru->nr_items);
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
- unsigned long *nr_to_walk)
+ unsigned long *nr_to_walk, bool irq_off)
{
struct list_lru_node *nlru = &lru->node[nid];
- struct list_lru_one *l;
+ struct list_lru_one *l = NULL;
struct list_head *item, *n;
unsigned long isolated = 0;
restart:
- l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true);
if (!l)
- goto out;
-
+ return isolated;
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
@@ -231,19 +298,19 @@ restart:
break;
--*nr_to_walk;
- ret = isolate(item, l, &nlru->lock, cb_arg);
+ ret = isolate(item, l, cb_arg);
switch (ret) {
+ /*
+ * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
+ * lock. List traversal will have to restart from scratch.
+ */
+ case LRU_RETRY:
+ goto restart;
case LRU_REMOVED_RETRY:
- assert_spin_locked(&nlru->lock);
fallthrough;
case LRU_REMOVED:
isolated++;
- nlru->nr_items--;
- /*
- * If the lru lock has been dropped, our list
- * traversal is now invalid and so we have to
- * restart from scratch.
- */
+ atomic_long_dec(&nlru->nr_items);
if (ret == LRU_REMOVED_RETRY)
goto restart;
break;
@@ -252,20 +319,13 @@ restart:
break;
case LRU_SKIP:
break;
- case LRU_RETRY:
- /*
- * The lru lock has been dropped, our list traversal is
- * now invalid and so we have to restart from scratch.
- */
- assert_spin_locked(&nlru->lock);
- goto restart;
case LRU_STOP:
- assert_spin_locked(&nlru->lock);
goto out;
default:
BUG();
}
}
+ unlock_list_lru(l, irq_off);
out:
return isolated;
}
@@ -275,14 +335,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
- unsigned long ret;
-
- spin_lock(&nlru->lock);
- ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
- cb_arg, nr_to_walk);
- spin_unlock(&nlru->lock);
- return ret;
+ return __list_lru_walk_one(lru, nid, memcg, isolate,
+ cb_arg, nr_to_walk, false);
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
@@ -291,14 +345,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
- unsigned long ret;
-
- spin_lock_irq(&nlru->lock);
- ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
- cb_arg, nr_to_walk);
- spin_unlock_irq(&nlru->lock);
- return ret;
+ return __list_lru_walk_one(lru, nid, memcg, isolate,
+ cb_arg, nr_to_walk, true);
}
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
@@ -313,16 +361,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
#ifdef CONFIG_MEMCG
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
struct list_lru_memcg *mlru;
+ struct mem_cgroup *memcg;
unsigned long index;
xa_for_each(&lru->xa, index, mlru) {
- struct list_lru_node *nlru = &lru->node[nid];
-
- spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(lru, nid, index,
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(index);
+ if (!mem_cgroup_tryget(memcg)) {
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+ isolated += __list_lru_walk_one(lru, nid, memcg,
isolate, cb_arg,
- nr_to_walk);
- spin_unlock(&nlru->lock);
+ nr_to_walk, false);
+ mem_cgroup_put(memcg);
if (*nr_to_walk <= 0)
break;
@@ -334,14 +387,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-static void init_one_lru(struct list_lru_one *l)
+static void init_one_lru(struct list_lru *lru, struct list_lru_one *l)
{
INIT_LIST_HEAD(&l->list);
+ spin_lock_init(&l->lock);
l->nr_items = 0;
+#ifdef CONFIG_LOCKDEP
+ if (lru->key)
+ lockdep_set_class(&l->lock, lru->key);
+#endif
}
#ifdef CONFIG_MEMCG
-static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
+static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp)
{
int nid;
struct list_lru_memcg *mlru;
@@ -351,25 +409,11 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
return NULL;
for_each_node(nid)
- init_one_lru(&mlru->node[nid]);
+ init_one_lru(lru, &mlru->node[nid]);
return mlru;
}
-static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
-{
- struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
-
- /*
- * The __list_lru_walk_one() can walk the list of this node.
- * We need kvfree_rcu() here. And the walking of the list
- * is under lru->node[nid]->lock, which can serve as a RCU
- * read-side critical section.
- */
- if (mlru)
- kvfree_rcu(mlru, rcu);
-}
-
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
if (memcg_aware)
@@ -393,77 +437,64 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
xas_unlock_irq(&xas);
}
-static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
- int src_idx, struct mem_cgroup *dst_memcg)
+static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
+ struct list_lru_one *src,
+ struct mem_cgroup *dst_memcg)
{
- struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
- struct list_lru_one *src, *dst;
-
- /*
- * Since list_lru_{add,del} may be called under an IRQ-safe lock,
- * we have to use IRQ-safe primitives here to avoid deadlock.
- */
- spin_lock_irq(&nlru->lock);
+ struct list_lru_one *dst;
- src = list_lru_from_memcg_idx(lru, nid, src_idx);
- if (!src)
- goto out;
+ spin_lock_irq(&src->lock);
dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
+ spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);
list_splice_init(&src->list, &dst->list);
-
if (src->nr_items) {
dst->nr_items += src->nr_items;
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
- src->nr_items = 0;
}
-out:
- spin_unlock_irq(&nlru->lock);
-}
-
-static void memcg_reparent_list_lru(struct list_lru *lru,
- int src_idx, struct mem_cgroup *dst_memcg)
-{
- int i;
+ /* Mark the list_lru_one dead */
+ src->nr_items = LONG_MIN;
- for_each_node(i)
- memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg);
-
- memcg_list_lru_free(lru, src_idx);
+ spin_unlock(&dst->lock);
+ spin_unlock_irq(&src->lock);
}
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
- struct cgroup_subsys_state *css;
struct list_lru *lru;
- int src_idx = memcg->kmemcg_id;
+ int i;
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent.
- *
- * After we have finished, all list_lrus corresponding to this cgroup
- * are guaranteed to remain empty. So we can safely free this cgroup's
- * list lrus in memcg_list_lru_free().
- *
- * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc()
- * from allocating list lrus for this cgroup after memcg_list_lru_free()
- * call.
- */
- rcu_read_lock();
- css_for_each_descendant_pre(css, &memcg->css) {
- struct mem_cgroup *child;
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &memcg_list_lrus, list) {
+ struct list_lru_memcg *mlru;
+ XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
- child = mem_cgroup_from_css(css);
- WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
- }
- rcu_read_unlock();
+ /*
+ * Lock the Xarray to ensure there is no ongoing list_lru_memcg
+ * allocation and that any further allocation will see css_is_dying().
+ */
+ xas_lock_irq(&xas);
+ mlru = xas_store(&xas, NULL);
+ xas_unlock_irq(&xas);
+ if (!mlru)
+ continue;
- mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &memcg_list_lrus, list)
- memcg_reparent_list_lru(lru, src_idx, parent);
+ /*
+ * With the Xarray value set to NULL, holding the lru lock below
+ * prevents list_lru_{add,del,isolate} from touching the lru, so it
+ * is safe to reparent.
+ */
+ for_each_node(i)
+ memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent);
+
+ /*
+ * Here all list_lrus corresponding to the cgroup are guaranteed
+ * to remain empty, so we can safely free this lru; any further
+ * memcg_list_lru_alloc() call will simply bail out.
+ */
+ kvfree_rcu(mlru, rcu);
+ }
mutex_unlock(&list_lrus_mutex);
}
@@ -478,77 +509,48 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
- int i;
unsigned long flags;
- struct list_lru_memcg_table {
- struct list_lru_memcg *mlru;
- struct mem_cgroup *memcg;
- } *table;
+ struct list_lru_memcg *mlru;
+ struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
return 0;
gfp &= GFP_RECLAIM_MASK;
- table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
- if (!table)
- return -ENOMEM;
-
/*
* Because the list_lru can be reparented to the parent cgroup's
* list_lru, we should make sure that this cgroup and all its
* ancestors have allocated list_lru_memcg.
*/
- for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
- if (memcg_list_lru_allocated(memcg, lru))
- break;
-
- table[i].memcg = memcg;
- table[i].mlru = memcg_init_list_lru_one(gfp);
- if (!table[i].mlru) {
- while (i--)
- kfree(table[i].mlru);
- kfree(table);
- return -ENOMEM;
+ do {
+ /*
+ * Keep finding the farthest parent that wasn't populated,
+ * until we reach memcg itself.
+ */
+ pos = memcg;
+ parent = parent_mem_cgroup(pos);
+ while (!memcg_list_lru_allocated(parent, lru)) {
+ pos = parent;
+ parent = parent_mem_cgroup(pos);
}
- }
-
- xas_lock_irqsave(&xas, flags);
- while (i--) {
- int index = READ_ONCE(table[i].memcg->kmemcg_id);
- struct list_lru_memcg *mlru = table[i].mlru;
- xas_set(&xas, index);
-retry:
- if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
- kfree(mlru);
- } else {
- xas_store(&xas, mlru);
- if (xas_error(&xas) == -ENOMEM) {
- xas_unlock_irqrestore(&xas, flags);
- if (xas_nomem(&xas, gfp))
- xas_set_err(&xas, 0);
- xas_lock_irqsave(&xas, flags);
- /*
- * The xas lock has been released, this memcg
- * can be reparented before us. So reload
- * memcg id. More details see the comments
- * in memcg_reparent_list_lrus().
- */
- index = READ_ONCE(table[i].memcg->kmemcg_id);
- if (index < 0)
- xas_set_err(&xas, 0);
- else if (!xas_error(&xas) && index != xas.xa_index)
- xas_set(&xas, index);
- goto retry;
+ mlru = memcg_init_list_lru_one(lru, gfp);
+ if (!mlru)
+ return -ENOMEM;
+ xas_set(&xas, pos->kmemcg_id);
+ do {
+ xas_lock_irqsave(&xas, flags);
+ if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
+ xas_store(&xas, mlru);
+ if (!xas_error(&xas))
+ mlru = NULL;
}
- }
- }
- /* xas_nomem() is used to free memory instead of memory allocation. */
- if (xas.xa_alloc)
- xas_nomem(&xas, gfp);
- xas_unlock_irqrestore(&xas, flags);
- kfree(table);
+ xas_unlock_irqrestore(&xas, flags);
+ } while (xas_nomem(&xas, gfp));
+ if (mlru)
+ kfree(mlru);
+ } while (pos != memcg && !css_is_dying(&pos->css));
return xas_error(&xas);
}
@@ -562,8 +564,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
}
#endif /* CONFIG_MEMCG */
-int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key, struct shrinker *shrinker)
+int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker)
{
int i;
@@ -581,12 +582,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
if (!lru->node)
return -ENOMEM;
- for_each_node(i) {
- spin_lock_init(&lru->node[i].lock);
- if (key)
- lockdep_set_class(&lru->node[i].lock, key);
- init_one_lru(&lru->node[i].lru);
- }
+ for_each_node(i)
+ init_one_lru(lru, &lru->node[i].lru);
memcg_init_list_lru(lru, memcg_aware);
list_lru_register(lru);
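A sketch of an isolate callback under the walker changes above, assuming the new list_lru_walk_cb signature without the spinlock_t * argument (the per-memcg l->lock is already held by the walker); struct demo_obj and demo_obj_unused() are hypothetical.

	static enum lru_status demo_isolate(struct list_head *item,
					    struct list_lru_one *l, void *cb_arg)
	{
		struct demo_obj *obj = container_of(item, struct demo_obj, lru_entry);

		if (!demo_obj_unused(obj))
			return LRU_SKIP;

		list_lru_isolate(l, item);	/* unlinks item, decrements l->nr_items */
		return LRU_REMOVED;
	}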
diff --git a/mm/maccess.c b/mm/maccess.c
index 518a25667323..8f0906180a94 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -13,9 +13,14 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
return true;
}
+/*
+ * The below only uses kmsan_check_memory() to ensure uninitialized kernel
+ * memory isn't leaked.
+ */
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
- __get_kernel_nofault(dst, src, type, err_label); \
+ __get_kernel_nofault(dst, src, type, err_label); \
+ kmsan_check_memory(src, sizeof(type)); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
@@ -49,7 +54,8 @@ EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
- __put_kernel_nofault(dst, src, type, err_label); \
+ __put_kernel_nofault(dst, src, type, err_label); \
+ instrument_write(dst, sizeof(type)); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
@@ -76,6 +82,7 @@ Efault:
pagefault_enable();
return -EFAULT;
}
+EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index ff139e57cca2..0ceae57da7da 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,12 @@
#include "internal.h"
#include "swap.h"
+/*
+ * Maximum number of attempts we make to install guard pages before we give up
+ * and return -ERESTARTNOINTR to have userspace try again.
+ */
+#define MAX_MADVISE_GUARD_RETRIES 3
+
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
@@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -1017,6 +1025,214 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+ vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
+
+ /*
+ * A user could lock after setting a guard range but that's fine, as
+ * they'd not be able to fault in. The issue arises when we try to zap
+ * existing locked VMAs. We don't want to do that.
+ */
+ if (!allow_locked)
+ disallowed |= VM_LOCKED;
+
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
+ return false;
+
+ return true;
+}
+
+static bool is_guard_pte_marker(pte_t ptent)
+{
+ return is_pte_marker(ptent) &&
+ is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pud_trans_huge(pudval) || pud_devmap(pudval);
+}
+
+static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+}
+
+static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pteval = ptep_get(pte);
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* If there is already a guard page marker, we have nothing to do. */
+ if (is_guard_pte_marker(pteval)) {
+ (*nr_pages)++;
+
+ return 0;
+ }
+
+ /* If populated return >0 so we abort the operation + zap. */
+ return 1;
+}
+
+static int guard_install_set_pte(unsigned long addr, unsigned long next,
+ pte_t *ptep, struct mm_walk *walk)
+{
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* Simply install a PTE marker; this causes a segfault on access. */
+ *ptep = make_pte_marker(PTE_MARKER_GUARD);
+ (*nr_pages)++;
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_install_walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_install(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ long err;
+ int i;
+
+ *prev = vma;
+ if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+ return -EINVAL;
+
+ /*
+ * If we install guard markers, then the range is no longer
+ * empty from a page table perspective and therefore it's
+ * appropriate to have an anon_vma.
+ *
+ * This ensures that on fork, we copy page tables correctly.
+ */
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+
+ /*
+ * Optimistically try to install the guard marker pages first. If any
+ * non-guard pages are encountered, give up and zap the range before
+ * trying again.
+ *
+ * We try a few times before giving up and returning to userspace to
+ * loop around, releasing locks in the process to avoid contention. This
+ * would only happen if there were a great many racing page faults.
+ *
+ * In most cases we should simply install the guard markers immediately
+ * with no zap or looping.
+ */
+ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
+ unsigned long nr_pages = 0;
+
+ /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+ err = walk_page_range_mm(vma->vm_mm, start, end,
+ &guard_install_walk_ops, &nr_pages);
+ if (err < 0)
+ return err;
+
+ if (err == 0) {
+ unsigned long nr_expected_pages = PHYS_PFN(end - start);
+
+ VM_WARN_ON(nr_pages != nr_expected_pages);
+ return 0;
+ }
+
+ /*
+ * OK, some of the range has non-guard pages mapped: zap
+ * them. This leaves existing guard pages in place.
+ */
+ zap_page_range_single(vma, start, end - start, NULL);
+ }
+
+ /*
+ * We were unable to install the guard pages due to being raced by page
+ * faults. This should not happen ordinarily. We return to userspace and
+ * immediately retry, relieving lock contention.
+ */
+ return restart_syscall();
+}
+
+static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (is_guard_pte_marker(ptent)) {
+ /* Simply clear the PTE marker. */
+ pte_clear_not_present_full(walk->mm, addr, pte, false);
+ update_mmu_cache(walk->vma, addr, pte);
+ }
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_remove_walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ /*
+ * We're ok with removing guards in mlock()'d ranges, as this is a
+ * non-destructive action.
+ */
+ if (!is_valid_guard_vma(vma, /* allow_locked = */true))
+ return -EINVAL;
+
+ return walk_page_range(vma->vm_mm, start, end,
+ &guard_remove_walk_ops, NULL);
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1098,6 +1314,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(vma, prev, start, end);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1197,6 +1417,8 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -1208,7 +1430,8 @@ madvise_behavior_valid(int behavior)
}
}
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
@@ -1477,6 +1700,45 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return do_madvise(current->mm, start, len_in, behavior);
}
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+ int behavior)
+{
+ ssize_t ret = 0;
+ size_t total_len;
+
+ total_len = iov_iter_count(iter);
+
+ while (iov_iter_count(iter)) {
+ ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+ iter_iov_len(iter), behavior);
+ /*
+ * An madvise operation is attempting to restart the syscall,
+ * but we cannot proceed as it would not be correct to repeat
+ * the operation in aggregate, and would be surprising to the
+ * user.
+ *
+ * As we have already dropped locks, it is safe to just loop and
+ * try again. We check for fatal signals in case we need to exit
+ * early anyway.
+ */
+ if (ret == -ERESTARTNOINTR) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ continue;
+ }
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iter_iov_len(iter));
+ }
+
+ ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+ return ret;
+}
+
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
@@ -1486,7 +1748,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iov_iter iter;
struct task_struct *task;
struct mm_struct *mm;
- size_t total_len;
unsigned int f_flags;
if (flags != 0) {
@@ -1504,38 +1765,33 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto free_iov;
}
- if (!process_madvise_behavior_valid(behavior)) {
- ret = -EINVAL;
- goto release_task;
- }
-
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm)) {
- ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ ret = PTR_ERR(mm);
goto release_task;
}
/*
+ * We need only perform this check if we are attempting to manipulate a
+ * remote process's address space.
+ */
+ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
+ /*
* Require CAP_SYS_NICE for influencing process performance. Note that
- * only non-destructive hints are currently supported.
+ * only non-destructive hints are currently supported for remote
+ * processes.
*/
if (mm != current->mm && !capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
- total_len = iov_iter_count(&iter);
-
- while (iov_iter_count(&iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
- iter_iov_len(&iter), behavior);
- if (ret < 0)
- break;
- iov_iter_advance(&iter, iter_iov_len(&iter));
- }
-
- ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ ret = vector_madvise(mm, &iter, behavior);
release_mm:
mmput(mm);
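A hypothetical userspace sketch of the new advice values (error handling omitted): guard markers make any access in the range fault with SIGSEGV, and removal is non-destructive.

	#include <sys/mman.h>

	static void guard(void *addr, size_t len)
	{
		/* Accesses in [addr, addr + len) now raise SIGSEGV. */
		madvise(addr, len, MADV_GUARD_INSTALL);
	}

	static void unguard(void *addr, size_t len)
	{
		/* Clears only the guard markers; existing mappings are untouched. */
		madvise(addr, len, MADV_GUARD_REMOVE);
	}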
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 86527d8fa7b9..a071fa43d479 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -40,31 +40,6 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-/* Stuffs for move charges at task migration. */
-/*
- * Types of charges to be moved.
- */
-#define MOVE_ANON 0x1ULL
-#define MOVE_FILE 0x2ULL
-#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
-
-/* "mc" and its members are protected by cgroup_mutex */
-static struct move_charge_struct {
- spinlock_t lock; /* for from, to */
- struct mm_struct *mm;
- struct mem_cgroup *from;
- struct mem_cgroup *to;
- unsigned long flags;
- unsigned long precharge;
- unsigned long moved_charge;
- unsigned long moved_swap;
- struct task_struct *moving_task; /* a task moving charges */
- wait_queue_head_t waitq; /* a waitq for other context */
-} mc = {
- .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
- .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -426,196 +401,22 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
return nr_reclaimed;
}
-/*
- * A routine for checking "mem" is under move_account() or not.
- *
- * Checking a cgroup is mc.from or mc.to or under hierarchy of
- * moving cgroups. This is for waiting at high-memory pressure
- * caused by "move".
- */
-static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *from;
- struct mem_cgroup *to;
- bool ret = false;
- /*
- * Unlike task_move routines, we access mc.to, mc.from not under
- * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
- */
- spin_lock(&mc.lock);
- from = mc.from;
- to = mc.to;
- if (!from)
- goto unlock;
-
- ret = mem_cgroup_is_descendant(from, memcg) ||
- mem_cgroup_is_descendant(to, memcg);
-unlock:
- spin_unlock(&mc.lock);
- return ret;
-}
-
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
-{
- if (mc.moving_task && current != mc.moving_task) {
- if (mem_cgroup_under_move(memcg)) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- return true;
- }
- }
- return false;
-}
-
-/**
- * folio_memcg_lock - Bind a folio to its memcg.
- * @folio: The folio.
- *
- * This function prevents unlocked LRU folios from being moved to
- * another cgroup.
- *
- * It ensures lifetime of the bound memcg. The caller is responsible
- * for the lifetime of the folio.
- */
-void folio_memcg_lock(struct folio *folio)
-{
- struct mem_cgroup *memcg;
- unsigned long flags;
-
- /*
- * The RCU lock is held throughout the transaction. The fast
- * path can get away without acquiring the memcg->move_lock
- * because page moving starts with an RCU grace period.
- */
- rcu_read_lock();
-
- if (mem_cgroup_disabled())
- return;
-again:
- memcg = folio_memcg(folio);
- if (unlikely(!memcg))
- return;
-
-#ifdef CONFIG_PROVE_LOCKING
- local_irq_save(flags);
- might_lock(&memcg->move_lock);
- local_irq_restore(flags);
-#endif
-
- if (atomic_read(&memcg->moving_account) <= 0)
- return;
-
- spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != folio_memcg(folio)) {
- spin_unlock_irqrestore(&memcg->move_lock, flags);
- goto again;
- }
-
- /*
- * When charge migration first begins, we can have multiple
- * critical sections holding the fast-path RCU lock and one
- * holding the slowpath move_lock. Track the task who has the
- * move_lock for folio_memcg_unlock().
- */
- memcg->move_lock_task = current;
- memcg->move_lock_flags = flags;
-}
-
-static void __folio_memcg_unlock(struct mem_cgroup *memcg)
-{
- if (memcg && memcg->move_lock_task == current) {
- unsigned long flags = memcg->move_lock_flags;
-
- memcg->move_lock_task = NULL;
- memcg->move_lock_flags = 0;
-
- spin_unlock_irqrestore(&memcg->move_lock, flags);
- }
-
- rcu_read_unlock();
-}
-
-/**
- * folio_memcg_unlock - Release the binding between a folio and its memcg.
- * @folio: The folio.
- *
- * This releases the binding created by folio_memcg_lock(). This does
- * not change the accounting of this folio to its memcg, but it does
- * permit others to change it.
- */
-void folio_memcg_unlock(struct folio *folio)
-{
- __folio_memcg_unlock(folio_memcg(folio));
-}
-
-#ifdef CONFIG_SWAP
-/**
- * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
- * @entry: swap entry to be moved
- * @from: mem_cgroup which the entry is moved from
- * @to: mem_cgroup which the entry is moved to
- *
- * It succeeds only when the swap_cgroup's record for this entry is the same
- * as the mem_cgroup's id of @from.
- *
- * Returns 0 on success, -EINVAL on failure.
- *
- * The caller must have charged to @to, IOW, called page_counter_charge() about
- * both res and memsw, and called css_get().
- */
-static int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to)
-{
- unsigned short old_id, new_id;
-
- old_id = mem_cgroup_id(from);
- new_id = mem_cgroup_id(to);
-
- if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mod_memcg_state(from, MEMCG_SWAP, -1);
- mod_memcg_state(to, MEMCG_SWAP, 1);
- return 0;
- }
- return -EINVAL;
-}
-#else
-static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to)
-{
- return -EINVAL;
-}
-#endif
-
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_from_css(css)->move_charge_at_immigrate;
+ return 0;
}
#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
"Please report your usecase to linux-mm@kvack.org if you "
"depend on this functionality.\n");
- if (val & ~MOVE_MASK)
+ if (val != 0)
return -EINVAL;
-
- /*
- * No kind of locking is needed in here, because ->can_attach() will
- * check this value once in the beginning of the process, and then carry
- * on with stale data. This means that changes to this value will only
- * affect task migrations starting after the change.
- */
- memcg->move_charge_at_immigrate = val;
return 0;
}
#else
@@ -626,785 +427,6 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif
-#ifdef CONFIG_MMU
-/* Handlers for move charge at task migration. */
-static int mem_cgroup_do_precharge(unsigned long count)
-{
- int ret;
-
- /* Try a single bulk charge without reclaim first, kswapd may wake */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
- if (!ret) {
- mc.precharge += count;
- return ret;
- }
-
- /* Try charges one by one with reclaim, but do not retry */
- while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
- if (ret)
- return ret;
- mc.precharge++;
- cond_resched();
- }
- return 0;
-}
-
-union mc_target {
- struct folio *folio;
- swp_entry_t ent;
-};
-
-enum mc_target_type {
- MC_TARGET_NONE = 0,
- MC_TARGET_PAGE,
- MC_TARGET_SWAP,
- MC_TARGET_DEVICE,
-};
-
-static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent)
-{
- struct page *page = vm_normal_page(vma, addr, ptent);
-
- if (!page)
- return NULL;
- if (PageAnon(page)) {
- if (!(mc.flags & MOVE_ANON))
- return NULL;
- } else {
- if (!(mc.flags & MOVE_FILE))
- return NULL;
- }
- get_page(page);
-
- return page;
-}
-
-#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- pte_t ptent, swp_entry_t *entry)
-{
- struct page *page = NULL;
- swp_entry_t ent = pte_to_swp_entry(ptent);
-
- if (!(mc.flags & MOVE_ANON))
- return NULL;
-
- /*
- * Handle device private pages that are not accessible by the CPU, but
- * stored as special swap entries in the page table.
- */
- if (is_device_private_entry(ent)) {
- page = pfn_swap_entry_to_page(ent);
- if (!get_page_unless_zero(page))
- return NULL;
- return page;
- }
-
- if (non_swap_entry(ent))
- return NULL;
-
- /*
- * Because swap_cache_get_folio() updates some statistics counter,
- * we call find_get_page() with swapper_space directly.
- */
- page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
- entry->val = ent.val;
-
- return page;
-}
-#else
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- pte_t ptent, swp_entry_t *entry)
-{
- return NULL;
-}
-#endif
-
-static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent)
-{
- unsigned long index;
- struct folio *folio;
-
- if (!vma->vm_file) /* anonymous vma */
- return NULL;
- if (!(mc.flags & MOVE_FILE))
- return NULL;
-
- /* folio is moved even if it's not RSS of this task(page-faulted). */
- /* shmem/tmpfs may report page out on swap: account for that too. */
- index = linear_page_index(vma, addr);
- folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
- if (IS_ERR(folio))
- return NULL;
- return folio_file_page(folio, index);
-}
-
-static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
-static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
-
-/**
- * mem_cgroup_move_account - move account of the folio
- * @folio: The folio.
- * @compound: charge the page as compound or small page
- * @from: mem_cgroup which the folio is moved from.
- * @to: mem_cgroup which the folio is moved to. @from != @to.
- *
- * The folio must be locked and not on the LRU.
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct folio *folio,
- bool compound,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- struct lruvec *from_vec, *to_vec;
- struct pglist_data *pgdat;
- unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
- int nid, ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON(compound && !folio_test_large(folio));
-
- ret = -EINVAL;
- if (folio_memcg(folio) != from)
- goto out;
-
- pgdat = folio_pgdat(folio);
- from_vec = mem_cgroup_lruvec(from, pgdat);
- to_vec = mem_cgroup_lruvec(to, pgdat);
-
- folio_memcg_lock(folio);
-
- if (folio_test_anon(folio)) {
- if (folio_mapped(folio)) {
- __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
- if (folio_test_pmd_mappable(folio)) {
- __mod_lruvec_state(from_vec, NR_ANON_THPS,
- -nr_pages);
- __mod_lruvec_state(to_vec, NR_ANON_THPS,
- nr_pages);
- }
- }
- } else {
- __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
-
- if (folio_test_swapbacked(folio)) {
- __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
- __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
- }
-
- if (folio_mapped(folio)) {
- __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
- }
-
- if (folio_test_dirty(folio)) {
- struct address_space *mapping = folio_mapping(folio);
-
- if (mapping_can_writeback(mapping)) {
- __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
- -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
- nr_pages);
- }
- }
- }
-
-#ifdef CONFIG_SWAP
- if (folio_test_swapcache(folio)) {
- __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
- __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
- }
-#endif
- if (folio_test_writeback(folio)) {
- __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
- __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
- }
-
- /*
- * All state has been migrated, let's switch to the new memcg.
- *
- * It is safe to change page's memcg here because the page
- * is referenced, charged, isolated, and locked: we can't race
- * with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page's memory cgroup.
- *
- * Note that folio_memcg_lock is a memcg lock, not a page lock,
- * to save space. As soon as we switch page's memory cgroup to a
- * new memcg that isn't locked, the above state can change
- * concurrently again. Make sure we're truly done with it.
- */
- smp_mb();
-
- css_get(&to->css);
- css_put(&from->css);
-
- /* Warning should never happen, so don't worry about refcount non-0 */
- WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
- folio->memcg_data = (unsigned long)to;
-
- __folio_memcg_unlock(from);
-
- ret = 0;
- nid = folio_nid(folio);
-
- local_irq_disable();
- memcg1_charge_statistics(to, nr_pages);
- memcg1_check_events(to, nid);
- memcg1_charge_statistics(from, -nr_pages);
- memcg1_check_events(from, nid);
- local_irq_enable();
-out:
- return ret;
-}
-
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma to which the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer where the target page or swap entry will be stored (can be NULL)
- *
- * Context: Called with pte lock held.
- * Return:
- * * MC_TARGET_NONE - If the pte is not a target for move charge.
- * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- * move charge. If @target is not NULL, the folio is stored in target->folio
- * with extra refcnt taken (Caller should release it).
- * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
- * target for charge migration. If @target is not NULL, the entry is
- * stored in target->ent.
- * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
- * thus not on the lru. For now such a page is charged like a regular page
- * would be, as it is just special memory taking the place of a regular page.
- * See Documentation/vm/hmm.txt and include/linux/hmm.h
- */
-static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, union mc_target *target)
-{
- struct page *page = NULL;
- struct folio *folio;
- enum mc_target_type ret = MC_TARGET_NONE;
- swp_entry_t ent = { .val = 0 };
-
- if (pte_present(ptent))
- page = mc_handle_present_pte(vma, addr, ptent);
- else if (pte_none_mostly(ptent))
- /*
- * PTE markers should be treated as a none pte here, separated
- * from other swap handling below.
- */
- page = mc_handle_file_pte(vma, addr, ptent);
- else if (is_swap_pte(ptent))
- page = mc_handle_swap_pte(vma, ptent, &ent);
-
- if (page)
- folio = page_folio(page);
- if (target && page) {
- if (!folio_trylock(folio)) {
- folio_put(folio);
- return ret;
- }
- /*
- * page_mapped() must be stable during the move. This
- * pte is locked, so if it's present, the page cannot
- * become unmapped. If it isn't, we have only partial
- * control over the mapped state: the page lock will
- * prevent new faults against pagecache and swapcache,
- * so an unmapped page cannot become mapped. However,
- * if the page is already mapped elsewhere, it can
- * unmap, and there is nothing we can do about it.
- * Alas, skip moving the page in this case.
- */
- if (!pte_present(ptent) && page_mapped(page)) {
- folio_unlock(folio);
- folio_put(folio);
- return ret;
- }
- }
-
- if (!page && !ent.val)
- return ret;
- if (page) {
- /*
- * Do only loose check w/o serialization.
- * mem_cgroup_move_account() checks the page is valid or
- * not under LRU exclusion.
- */
- if (folio_memcg(folio) == mc.from) {
- ret = MC_TARGET_PAGE;
- if (folio_is_device_private(folio) ||
- folio_is_device_coherent(folio))
- ret = MC_TARGET_DEVICE;
- if (target)
- target->folio = folio;
- }
- if (!ret || !target) {
- if (target)
- folio_unlock(folio);
- folio_put(folio);
- }
- }
- /*
- * There is a swap entry and a page doesn't exist or isn't charged.
- * But we cannot move a tail-page in a THP.
- */
- if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
- mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
- ret = MC_TARGET_SWAP;
- if (target)
- target->ent = ent;
- }
- return ret;
-}
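The get_mctgt_type() convention above (an enum result plus a caller-supplied union that receives the matching payload) can be seen in isolation in this small user-space sketch; all names here are illustrative, not kernel code.

#include <stdio.h>

enum target_type { TARGET_NONE, TARGET_PAGE, TARGET_SWAP };

union target {
	void *folio;            /* valid only when TARGET_PAGE is returned */
	unsigned long swap_ent; /* valid only when TARGET_SWAP is returned */
};

/* Returns what kind of target was found; fills *t for that kind if non-NULL. */
static enum target_type classify(int has_page, int has_swap, union target *t)
{
	if (has_page) {
		if (t)
			t->folio = (void *)0x1000; /* placeholder payload */
		return TARGET_PAGE;
	}
	if (has_swap) {
		if (t)
			t->swap_ent = 42;
		return TARGET_SWAP;
	}
	return TARGET_NONE;
}

int main(void)
{
	union target t;

	switch (classify(0, 1, &t)) {
	case TARGET_PAGE:
		printf("page target %p\n", t.folio);
		break;
	case TARGET_SWAP:
		printf("swap target %lu\n", t.swap_ent);
		break;
	default:
		printf("no target\n");
	}
	return 0;
}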
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * We don't consider PMD mapped swapping or file mapped pages because THP does
- * not support them for now.
- * Caller should make sure that pmd_trans_huge(pmd) is true.
- */
-static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, union mc_target *target)
-{
- struct page *page = NULL;
- struct folio *folio;
- enum mc_target_type ret = MC_TARGET_NONE;
-
- if (unlikely(is_swap_pmd(pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmd));
- return ret;
- }
- page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!page || !PageHead(page), page);
- folio = page_folio(page);
- if (!(mc.flags & MOVE_ANON))
- return ret;
- if (folio_memcg(folio) == mc.from) {
- ret = MC_TARGET_PAGE;
- if (target) {
- folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- return MC_TARGET_NONE;
- }
- target->folio = folio;
- }
- }
- return ret;
-}
-#else
-static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, union mc_target *target)
-{
- return MC_TARGET_NONE;
-}
-#endif
-
-static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte;
- spinlock_t *ptl;
-
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- /*
- * Note there cannot be MC_TARGET_DEVICE for now as we do not
- * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
- * this might change.
- */
- if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
- mc.precharge += HPAGE_PMD_NR;
- spin_unlock(ptl);
- return 0;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
- for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
- mc.precharge++; /* increment precharge temporarily */
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
-
- return 0;
-}
-
-static const struct mm_walk_ops precharge_walk_ops = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
-{
- unsigned long precharge;
-
- mmap_read_lock(mm);
- walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
- mmap_read_unlock(mm);
-
- precharge = mc.precharge;
- mc.precharge = 0;
-
- return precharge;
-}
-
-static int mem_cgroup_precharge_mc(struct mm_struct *mm)
-{
- unsigned long precharge = mem_cgroup_count_precharge(mm);
-
- VM_BUG_ON(mc.moving_task);
- mc.moving_task = current;
- return mem_cgroup_do_precharge(precharge);
-}
-
-/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
-static void __mem_cgroup_clear_mc(void)
-{
- struct mem_cgroup *from = mc.from;
- struct mem_cgroup *to = mc.to;
-
- /* we must uncharge all the leftover precharges from mc.to */
- if (mc.precharge) {
- mem_cgroup_cancel_charge(mc.to, mc.precharge);
- mc.precharge = 0;
- }
- /*
- * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
- * we must uncharge here.
- */
- if (mc.moved_charge) {
- mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
- mc.moved_charge = 0;
- }
- /* we must fixup refcnts and charges */
- if (mc.moved_swap) {
- /* uncharge swap account from the old cgroup */
- if (!mem_cgroup_is_root(mc.from))
- page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
-
- mem_cgroup_id_put_many(mc.from, mc.moved_swap);
-
- /*
- * we charged both to->memory and to->memsw, so we
- * should uncharge to->memory.
- */
- if (!mem_cgroup_is_root(mc.to))
- page_counter_uncharge(&mc.to->memory, mc.moved_swap);
-
- mc.moved_swap = 0;
- }
- memcg1_oom_recover(from);
- memcg1_oom_recover(to);
- wake_up_all(&mc.waitq);
-}
-
-static void mem_cgroup_clear_mc(void)
-{
- struct mm_struct *mm = mc.mm;
-
- /*
- * we must clear moving_task before waking up waiters at the end of
- * task migration.
- */
- mc.moving_task = NULL;
- __mem_cgroup_clear_mc();
- spin_lock(&mc.lock);
- mc.from = NULL;
- mc.to = NULL;
- mc.mm = NULL;
- spin_unlock(&mc.lock);
-
- mmput(mm);
-}
-
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
- struct mem_cgroup *from;
- struct task_struct *leader, *p;
- struct mm_struct *mm;
- unsigned long move_flags;
- int ret = 0;
-
- /* charge immigration isn't supported on the default hierarchy */
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return 0;
-
- /*
- * Multi-process migrations only happen on the default hierarchy
- * where charge immigration is not used. Perform charge
- * immigration if @tset contains a leader and whine if there are
- * multiple.
- */
- p = NULL;
- cgroup_taskset_for_each_leader(leader, css, tset) {
- WARN_ON_ONCE(p);
- p = leader;
- memcg = mem_cgroup_from_css(css);
- }
- if (!p)
- return 0;
-
- /*
- * We are now committed to this value whatever it is. Changes in this
- * tunable will only affect upcoming migrations, not the current one.
- * So we need to save it, and keep it going.
- */
- move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (!move_flags)
- return 0;
-
- from = mem_cgroup_from_task(p);
-
- VM_BUG_ON(from == memcg);
-
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move an owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.mm = mm;
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- } else {
- mmput(mm);
- }
- return ret;
-}
-
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
- if (mc.to)
- mem_cgroup_clear_mc();
-}
-
-static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- int ret = 0;
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte;
- spinlock_t *ptl;
- enum mc_target_type target_type;
- union mc_target target;
- struct folio *folio;
- bool tried_split_before = false;
-
-retry_pmd:
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(ptl);
- return 0;
- }
- target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
- if (target_type == MC_TARGET_PAGE) {
- folio = target.folio;
- /*
- * Deferred split queue locking depends on memcg,
- * and unqueue is unsafe unless folio refcount is 0:
- * split or skip if on the queue? first try to split.
- */
- if (!list_empty(&folio->_deferred_list)) {
- spin_unlock(ptl);
- if (!tried_split_before)
- split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (tried_split_before)
- return 0;
- tried_split_before = true;
- goto retry_pmd;
- }
- /*
- * So long as that pmd lock is held, the folio cannot
- * be racily added to the _deferred_list, because
- * __folio_remove_rmap() will find !partially_mapped.
- */
- if (folio_isolate_lru(folio)) {
- if (!mem_cgroup_move_account(folio, true,
- mc.from, mc.to)) {
- mc.precharge -= HPAGE_PMD_NR;
- mc.moved_charge += HPAGE_PMD_NR;
- }
- folio_putback_lru(folio);
- }
- folio_unlock(folio);
- folio_put(folio);
- } else if (target_type == MC_TARGET_DEVICE) {
- folio = target.folio;
- if (!mem_cgroup_move_account(folio, true,
- mc.from, mc.to)) {
- mc.precharge -= HPAGE_PMD_NR;
- mc.moved_charge += HPAGE_PMD_NR;
- }
- folio_unlock(folio);
- folio_put(folio);
- }
- spin_unlock(ptl);
- return 0;
- }
-
-retry:
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
- for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = ptep_get(pte++);
- bool device = false;
- swp_entry_t ent;
-
- if (!mc.precharge)
- break;
-
- switch (get_mctgt_type(vma, addr, ptent, &target)) {
- case MC_TARGET_DEVICE:
- device = true;
- fallthrough;
- case MC_TARGET_PAGE:
- folio = target.folio;
- /*
- * We can have a part of the split pmd here. Moving it
- * can be done but it would be too convoluted so simply
- * ignore such a partial THP and keep it in original
- * memcg. There should be somebody mapping the head.
- */
- if (folio_test_large(folio))
- goto put;
- if (!device && !folio_isolate_lru(folio))
- goto put;
- if (!mem_cgroup_move_account(folio, false,
- mc.from, mc.to)) {
- mc.precharge--;
- /* we uncharge from mc.from later. */
- mc.moved_charge++;
- }
- if (!device)
- folio_putback_lru(folio);
-put: /* get_mctgt_type() gets & locks the page */
- folio_unlock(folio);
- folio_put(folio);
- break;
- case MC_TARGET_SWAP:
- ent = target.ent;
- if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
- mc.precharge--;
- mem_cgroup_id_get_many(mc.to, 1);
- /* we fixup other refcnts and charges later. */
- mc.moved_swap++;
- }
- break;
- default:
- break;
- }
- }
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
-
- if (addr != end) {
- /*
- * We have consumed all precharges we got in can_attach().
- * We try to charge one by one, but don't do any additional
- * charging to mc.to if we have already failed to charge once in
- * the attach() phase.
- */
- ret = mem_cgroup_do_precharge(1);
- if (!ret)
- goto retry;
- }
-
- return ret;
-}
-
-static const struct mm_walk_ops charge_walk_ops = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static void mem_cgroup_move_charge(void)
-{
- lru_add_drain_all();
- /*
- * Signal folio_memcg_lock() to take the memcg's move_lock
- * while we're moving its pages to another memcg. Then wait
- * for already started RCU-only updates to finish.
- */
- atomic_inc(&mc.from->moving_account);
- synchronize_rcu();
-retry:
- if (unlikely(!mmap_read_trylock(mc.mm))) {
- /*
- * Someone who is holding the mmap_lock might be waiting in
- * waitq. So we cancel all extra charges, wake up all waiters,
- * and retry. Because we cancel precharges, we might not be able
- * to move enough charges, but moving charge is a best-effort
- * feature anyway, so it wouldn't be a big problem.
- */
- __mem_cgroup_clear_mc();
- cond_resched();
- goto retry;
- }
- /*
- * When we have consumed all precharges and failed to do an
- * additional charge, the page walk just aborts.
- */
- walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
- mmap_read_unlock(mc.mm);
- atomic_dec(&mc.from->moving_account);
-}
-
-void memcg1_move_task(void)
-{
- if (mc.to) {
- mem_cgroup_move_charge();
- mem_cgroup_clear_mc();
- }
-}
-
-#else /* !CONFIG_MMU */
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
- return 0;
-}
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
-}
-void memcg1_move_task(void)
-{
-}
-#endif
-
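To make the bookkeeping in the removed charge-moving code easier to follow, a rough user-space toy model of the memory-charge counters (precharge reserved up front, consumed per moved page, leftovers cancelled at clear time) might look like the following; it is only a sketch with illustrative names and values, not kernel code.

#include <assert.h>
#include <stdio.h>

struct move_ctx {
	long to_reserved;   /* charges currently reserved on the destination */
	long from_charged;  /* charges still accounted to the source */
	long precharge;     /* reserved but not yet consumed by a move */
	long moved_charge;  /* pages whose accounting switched to the dest */
};

static void precharge(struct move_ctx *mc, long nr)
{
	mc->to_reserved += nr;   /* stands in for the precharge step */
	mc->precharge += nr;
}

static void move_one(struct move_ctx *mc)
{
	assert(mc->precharge > 0);
	mc->precharge--;         /* consumed by a successful per-page move */
	mc->moved_charge++;      /* source is only uncharged at clear time */
}

static void clear_mc(struct move_ctx *mc)
{
	mc->to_reserved -= mc->precharge;     /* cancel unused precharge */
	mc->precharge = 0;
	mc->from_charged -= mc->moved_charge; /* finally uncharge the source */
	mc->moved_charge = 0;
}

int main(void)
{
	struct move_ctx mc = { .from_charged = 8 };

	precharge(&mc, 8);        /* attach phase: reserve for every candidate */
	for (int i = 0; i < 5; i++)
		move_one(&mc);    /* post-attach phase: walk and move pages */
	clear_mc(&mc);

	printf("dest reserved %ld, source charged %ld\n",
	       mc.to_reserved, mc.from_charged);
	return 0;
}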
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -2072,7 +1094,6 @@ void memcg1_memcg_init(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
- spin_lock_init(&memcg->move_lock);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
}
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index c0672e25bcdb..0e3b82951d91 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -80,12 +80,7 @@ static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
}
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg);
-
struct cgroup_taskset;
-int memcg1_can_attach(struct cgroup_taskset *tset);
-void memcg1_cancel_attach(struct cgroup_taskset *tset);
-void memcg1_move_task(void);
void memcg1_css_offline(struct mem_cgroup *memcg);
/* for encoding cft->private value on file */
@@ -130,7 +125,6 @@ static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {}
static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {}
static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {}
-static inline bool memcg1_wait_acct_move(struct mem_cgroup *memcg) { return false; }
static inline void memcg1_css_offline(struct mem_cgroup *memcg) {}
static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { return true; }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53db98d2c4a1..7b3503d12aaf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -71,6 +71,10 @@
#include <linux/uaccess.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/memcg.h>
+#undef CREATE_TRACE_POINTS
+
#include <trace/events/vmscan.h>
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
@@ -114,6 +118,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
return container_of(vmpr, struct mem_cgroup, vmpressure);
}
+#define SEQ_BUF_SIZE SZ_4K
#define CURRENT_OBJCG_UPDATE_BIT 0
#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
@@ -310,6 +315,9 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_HUGETLB_PAGE
+ NR_HUGETLB,
+#endif
};
static const unsigned int memcg_stat_items[] = {
@@ -418,6 +426,8 @@ static const unsigned int memcg_vm_event_stat[] = {
PGPGIN,
PGPGOUT,
#endif
+ PSWPIN,
+ PSWPOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -588,8 +598,16 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(struct mem_cgroup *memcg)
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
+ bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
+
+ trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ force, needs_flush);
+
+ if (!force && !needs_flush)
+ return;
+
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
@@ -613,8 +631,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
if (!memcg)
memcg = root_mem_cgroup;
- if (memcg_vmstats_needs_flush(memcg->vmstats))
- do_flush_stats(memcg);
+ __mem_cgroup_flush_stats(memcg, false);
}
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
@@ -630,7 +647,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
- do_flush_stats(root_mem_cgroup);
+ __mem_cgroup_flush_stats(root_mem_cgroup, true);
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -684,7 +701,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
return;
__this_cpu_add(memcg->vmstats_percpu->state[i], val);
- memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+ val = memcg_state_val_in_pages(idx, val);
+ memcg_rstat_updated(memcg, val);
+ trace_mod_memcg_state(memcg, idx, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -743,7 +762,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
- memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+ val = memcg_state_val_in_pages(idx, val);
+ memcg_rstat_updated(memcg, val);
+ trace_mod_memcg_lruvec_state(memcg, idx, val);
memcg_stats_unlock();
}
@@ -834,6 +855,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[i], count);
memcg_rstat_updated(memcg, count);
+ trace_count_memcg_events(memcg, idx, count);
memcg_stats_unlock();
}
@@ -1181,7 +1203,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held.
@@ -1203,7 +1224,6 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
@@ -1227,7 +1247,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
@@ -1350,6 +1369,9 @@ static const struct memory_stat memory_stats[] = {
{ "unevictable", NR_UNEVICTABLE },
{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
+#ifdef CONFIG_HUGETLB_PAGE
+ { "hugetlb", NR_HUGETLB },
+#endif
/* The memory events */
{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
@@ -1445,6 +1467,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
+ !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ continue;
+#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
@@ -1520,7 +1547,7 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
- static char buf[PAGE_SIZE];
+ static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1546,7 +1573,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_init(&s, buf, SEQ_BUF_SIZE);
memory_stat_format(memcg, &s);
seq_buf_do_printk(&s, KERN_INFO);
}
@@ -2234,12 +2261,6 @@ retry:
*/
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
goto retry;
- /*
- * At task move, charge accounts can be doubly counted. So, it's
- * better to wait until the end of task_move if something is going on.
- */
- if (memcg1_wait_acct_move(mem_over_limit))
- goto retry;
if (nr_retries--)
goto retry;
@@ -2373,9 +2394,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
*
* - the page lock
* - LRU isolation
- * - folio_memcg_lock()
* - exclusive reference
- * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
@@ -3102,15 +3121,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
- memcg_reparent_objcgs(memcg, parent);
+ memcg_reparent_list_lrus(memcg, parent);
/*
- * After we have finished memcg_reparent_objcgs(), all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty.
- * The ordering is imposed by list_lru_node->lock taken by
- * memcg_reparent_list_lrus().
+ * Objcg's reparenting must come after list_lru's, to make sure the list_lru
+ * helpers won't use the parent's list_lru until the child is drained.
*/
- memcg_reparent_list_lrus(memcg, parent);
+ memcg_reparent_objcgs(memcg, parent);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -3733,68 +3750,90 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
-static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+struct aggregate_control {
+ /* pointer to the aggregated (CPU and subtree aggregated) counters */
+ long *aggregate;
+ /* pointer to the non-hierarchical (CPU aggregated) counters */
+ long *local;
+ /* pointer to the pending child counters during tree propagation */
+ long *pending;
+ /* pointer to the parent's pending counters, could be NULL */
+ long *ppending;
+ /* pointer to the percpu counters to be aggregated */
+ long *cstat;
+ /* pointer to the percpu counters of the last aggregation */
+ long *cstat_prev;
+ /* size of the above counters */
+ int size;
+};
+
+static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct memcg_vmstats_percpu *statc;
+ int i;
long delta, delta_cpu, v;
- int i, nid;
-
- statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) {
+ for (i = 0; i < ac->size; i++) {
/*
* Collect the aggregated propagation counts of groups
* below us. We're in a per-cpu loop here and this is
* a global counter, so the first cycle will get them.
*/
- delta = memcg->vmstats->state_pending[i];
+ delta = ac->pending[i];
if (delta)
- memcg->vmstats->state_pending[i] = 0;
+ ac->pending[i] = 0;
/* Add CPU changes on this level since the last flush */
delta_cpu = 0;
- v = READ_ONCE(statc->state[i]);
- if (v != statc->state_prev[i]) {
- delta_cpu = v - statc->state_prev[i];
+ v = READ_ONCE(ac->cstat[i]);
+ if (v != ac->cstat_prev[i]) {
+ delta_cpu = v - ac->cstat_prev[i];
delta += delta_cpu;
- statc->state_prev[i] = v;
+ ac->cstat_prev[i] = v;
}
/* Aggregate counts on this level and propagate upwards */
if (delta_cpu)
- memcg->vmstats->state_local[i] += delta_cpu;
+ ac->local[i] += delta_cpu;
if (delta) {
- memcg->vmstats->state[i] += delta;
- if (parent)
- parent->vmstats->state_pending[i] += delta;
+ ac->aggregate[i] += delta;
+ if (ac->ppending)
+ ac->ppending[i] += delta;
}
}
+}
- for (i = 0; i < NR_MEMCG_EVENTS; i++) {
- delta = memcg->vmstats->events_pending[i];
- if (delta)
- memcg->vmstats->events_pending[i] = 0;
-
- delta_cpu = 0;
- v = READ_ONCE(statc->events[i]);
- if (v != statc->events_prev[i]) {
- delta_cpu = v - statc->events_prev[i];
- delta += delta_cpu;
- statc->events_prev[i] = v;
- }
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ struct aggregate_control ac;
+ int nid;
- if (delta_cpu)
- memcg->vmstats->events_local[i] += delta_cpu;
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- if (delta) {
- memcg->vmstats->events[i] += delta;
- if (parent)
- parent->vmstats->events_pending[i] += delta;
- }
- }
+ ac = (struct aggregate_control) {
+ .aggregate = memcg->vmstats->state,
+ .local = memcg->vmstats->state_local,
+ .pending = memcg->vmstats->state_pending,
+ .ppending = parent ? parent->vmstats->state_pending : NULL,
+ .cstat = statc->state,
+ .cstat_prev = statc->state_prev,
+ .size = MEMCG_VMSTAT_SIZE,
+ };
+ mem_cgroup_stat_aggregate(&ac);
+
+ ac = (struct aggregate_control) {
+ .aggregate = memcg->vmstats->events,
+ .local = memcg->vmstats->events_local,
+ .pending = memcg->vmstats->events_pending,
+ .ppending = parent ? parent->vmstats->events_pending : NULL,
+ .cstat = statc->events,
+ .cstat_prev = statc->events_prev,
+ .size = NR_MEMCG_EVENTS,
+ };
+ mem_cgroup_stat_aggregate(&ac);
for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
@@ -3807,28 +3846,17 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
- for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
- delta = lstats->state_pending[i];
- if (delta)
- lstats->state_pending[i] = 0;
-
- delta_cpu = 0;
- v = READ_ONCE(lstatc->state[i]);
- if (v != lstatc->state_prev[i]) {
- delta_cpu = v - lstatc->state_prev[i];
- delta += delta_cpu;
- lstatc->state_prev[i] = v;
- }
-
- if (delta_cpu)
- lstats->state_local[i] += delta_cpu;
+ ac = (struct aggregate_control) {
+ .aggregate = lstats->state,
+ .local = lstats->state_local,
+ .pending = lstats->state_pending,
+ .ppending = plstats ? plstats->state_pending : NULL,
+ .cstat = lstatc->state,
+ .cstat_prev = lstatc->state_prev,
+ .size = NR_MEMCG_NODE_STAT_ITEMS,
+ };
+ mem_cgroup_stat_aggregate(&ac);
- if (delta) {
- lstats->state[i] += delta;
- if (plstats)
- plstats->state_pending[i] += delta;
- }
- }
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
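The refactored flush path above funnels three formerly duplicated loops through one helper. A stand-alone sketch of that aggregation step (pending child deltas plus this CPU's delta since the last flush, propagated to the parent) could look like the following; it is illustrative user-space code, not the kernel implementation.

#include <stdio.h>

struct agg {
	long *aggregate;  /* hierarchical (subtree) totals */
	long *local;      /* non-hierarchical totals */
	long *pending;    /* deltas queued by children */
	long *ppending;   /* parent's pending array, may be NULL */
	long *cstat;      /* this CPU's current counters */
	long *cstat_prev; /* this CPU's counters at the last flush */
	int size;
};

static void stat_aggregate(struct agg *ac)
{
	for (int i = 0; i < ac->size; i++) {
		long delta = ac->pending[i];  /* fold in child deltas */

		ac->pending[i] = 0;

		/* delta on this CPU since the last flush */
		long delta_cpu = ac->cstat[i] - ac->cstat_prev[i];
		ac->cstat_prev[i] = ac->cstat[i];
		delta += delta_cpu;

		if (delta_cpu)
			ac->local[i] += delta_cpu;
		if (delta) {
			ac->aggregate[i] += delta;
			if (ac->ppending)  /* propagate upwards */
				ac->ppending[i] += delta;
		}
	}
}

int main(void)
{
	long agg_v[2] = {0}, local_v[2] = {0}, pending_v[2] = {3, 0};
	long parent_pending[2] = {0};
	long cpu[2] = {5, 7}, cpu_prev[2] = {1, 7};
	struct agg ac = {
		.aggregate = agg_v, .local = local_v, .pending = pending_v,
		.ppending = parent_pending, .cstat = cpu, .cstat_prev = cpu_prev,
		.size = 2,
	};

	stat_aggregate(&ac);
	printf("aggregate[0]=%ld local[0]=%ld parent_pending[0]=%ld\n",
	       agg_v[0], local_v[0], parent_pending[0]);
	return 0;
}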
@@ -4189,12 +4217,12 @@ static int memory_events_local_show(struct seq_file *m, void *v)
int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
struct seq_buf s;
if (!buf)
return -ENOMEM;
- seq_buf_init(&s, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, SEQ_BUF_SIZE);
memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
@@ -4433,9 +4461,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
.exit = mem_cgroup_exit,
.dfl_cftypes = memory_files,
#ifdef CONFIG_MEMCG_V1
- .can_attach = memcg1_can_attach,
- .cancel_attach = memcg1_cancel_attach,
- .post_attach = memcg1_move_task,
.legacy_cftypes = mem_cgroup_legacy_files,
#endif
.early_init = 0,
@@ -5277,11 +5302,8 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
break;
}
- /*
- * mem_cgroup_flush_stats() ignores small changes. Use
- * do_flush_stats() directly to get accurate stats for charging.
- */
- do_flush_stats(memcg);
+ /* Force flush to get accurate stats for charging */
+ __mem_cgroup_flush_stats(memcg, true);
pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 96ce31e5a203..a7b8ccd29b6f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -100,7 +100,7 @@ static ssize_t _name##_show(struct device *dev, \
{ \
struct memory_failure_stats *mf_stats = \
&NODE_DATA(dev->id)->mf_stats; \
- return sprintf(buf, "%lu\n", mf_stats->_name); \
+ return sysfs_emit(buf, "%lu\n", mf_stats->_name); \
} \
static DEVICE_ATTR_RO(_name)
@@ -445,7 +445,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
-static void __add_to_kill(struct task_struct *tsk, struct page *p,
+static void __add_to_kill(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -461,7 +461,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
else
- tk->size_shift = page_shift(compound_head(p));
+ tk->size_shift = folio_shift(page_folio(p));
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -486,7 +486,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
list_add_tail(&tk->nd, to_kill);
}
-static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
+static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -509,7 +509,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
return false;
}
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -606,8 +606,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
/*
* Collect processes when the error hit an anonymous page.
*/
-static void collect_procs_anon(struct folio *folio, struct page *page,
- struct list_head *to_kill, int force_early)
+static void collect_procs_anon(const struct folio *folio,
+ const struct page *page, struct list_head *to_kill,
+ int force_early)
{
struct task_struct *tsk;
struct anon_vma *av;
@@ -617,7 +618,7 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
if (av == NULL) /* Not actually mapped anymore */
return;
- pgoff = page_to_pgoff(page);
+ pgoff = page_pgoff(folio, page);
rcu_read_lock();
for_each_process(tsk) {
struct vm_area_struct *vma;
@@ -643,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct folio *folio, struct page *page,
- struct list_head *to_kill, int force_early)
+static void collect_procs_file(const struct folio *folio,
+ const struct page *page, struct list_head *to_kill,
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -653,7 +655,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
i_mmap_lock_read(mapping);
rcu_read_lock();
- pgoff = page_to_pgoff(page);
+ pgoff = page_pgoff(folio, page);
for_each_process(tsk) {
struct task_struct *t = task_early_kill(tsk, force_early);
unsigned long addr;
@@ -671,7 +673,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
*/
if (vma->vm_mm != t->mm)
continue;
- addr = page_address_in_vma(page, vma);
+ addr = page_address_in_vma(folio, page, vma);
add_to_kill_anon_file(t, page, vma, to_kill, addr);
}
}
@@ -680,7 +682,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
}
#ifdef CONFIG_FS_DAX
-static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma,
struct list_head *to_kill, pgoff_t pgoff)
{
@@ -691,7 +693,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
/*
* Collect processes when the error hit a fsdax page.
*/
-static void collect_procs_fsdax(struct page *page,
+static void collect_procs_fsdax(const struct page *page,
struct address_space *mapping, pgoff_t pgoff,
struct list_head *to_kill, bool pre_remove)
{
@@ -725,7 +727,7 @@ static void collect_procs_fsdax(struct page *page,
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
-static void collect_procs(struct folio *folio, struct page *page,
+static void collect_procs(const struct folio *folio, const struct page *page,
struct list_head *tokill, int force_early)
{
if (!folio->mapping)
diff --git a/mm/memory.c b/mm/memory.c
index bdf77a3ec47b..75c2dfd04f72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,4 +1,3 @@
-
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
@@ -44,7 +43,6 @@
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
@@ -1061,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
if (need_zero)
new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
else
- new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- addr, false);
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
if (!new_folio)
return NULL;
@@ -1085,6 +1082,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pmd_t dummy_pmdval;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
@@ -1110,7 +1108,15 @@ again:
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+
+ /*
+ * We already hold the exclusive mmap_lock; copy_pte_range() and
+ * retract_page_tables() use vma->anon_vma for exclusion, so the
+ * PTE page is stable and there is no need to get the pmdval and
+ * do a pmd_same() check.
+ */
+ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
+ &src_ptl);
if (!src_pte) {
pte_unmap_unlock(dst_pte, dst_ptl);
/* ret == 0 */
@@ -1449,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
return !folio_test_anon(folio);
}
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
{
if (!details)
return false;
@@ -1470,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (vma_is_anonymous(vma))
return;
- if (zap_drop_file_uffd_wp(details))
+ if (zap_drop_markers(details))
return;
for (;;) {
@@ -1665,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* drop the marker if explicitly requested.
*/
if (!vma_is_anonymous(vma) &&
- !zap_drop_file_uffd_wp(details))
+ !zap_drop_markers(details))
+ continue;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
continue;
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
@@ -3997,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (marker & PTE_MARKER_POISONED)
return VM_FAULT_HWPOISON;
+ /* Hitting a guard page is always a fatal condition. */
+ if (marker & PTE_MARKER_GUARD)
+ return VM_FAULT_SIGSEGV;
+
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -4010,8 +4028,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
struct folio *folio;
swp_entry_t entry;
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- vmf->address, false);
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
if (!folio)
return NULL;
@@ -4167,7 +4184,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
@@ -4706,7 +4723,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
@@ -4714,7 +4731,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
goto next;
}
folio_throttle_swaprate(folio, gfp);
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation
+ * (__GFP_ZERO not used), folio_zero_user() is used
+ * to make sure that the page corresponding to the
+ * faulting address will be hot in the cache after
+ * zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, vmf->address);
return folio;
}
next:
@@ -5743,14 +5768,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->pte = NULL;
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
+ pmd_t dummy_pmdval;
+
/*
* A regular pmd is established and it can't morph into a huge
* pmd by anon khugepaged, since that takes mmap_lock in write
* mode; but shmem or file collapse to THP could still morph
* it into a huge pmd: just retry later if so.
+ *
+ * Use the maywrite version to indicate that vmf->pte may be
+ * modified, but since we will use pte_same() to detect the
+ * change of the !pte_none() entry, there is no need to recheck
+ * the pmdval. Here we choose to pass a dummy variable instead
+ * of NULL, which helps new users think about why this place is
+ * special.
*/
- vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
+ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &dummy_pmdval,
+ &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
vmf->orig_pte = ptep_get_lockless(vmf->pte);
@@ -6365,7 +6400,7 @@ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
struct address_space *mapping = file ? file->f_mapping : NULL;
if (mapping)
- lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
+ lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
lockdep_is_held(&vma->vm_mm->mmap_lock));
else
lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 621ae1015106..c43b4e7fb298 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = PHYSMEM_END;
+ const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b646fab3e45e..bb37cd1a51d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -8,7 +8,7 @@
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
- * Support four policies per VMA and per process:
+ * Support six policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
@@ -1367,7 +1367,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_entry_is_head(folio, &pagelist, lru)) {
vma_iter_init(&vmi, mm, start);
for_each_vma_range(vmi, vma, end) {
- addr = page_address_in_vma(
+ addr = page_address_in_vma(folio,
folio_page(folio, 0), vma);
if (addr != -EFAULT)
break;
@@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* @order: Order of the folio.
* @vma: Pointer to VMA.
* @addr: Virtual address of the allocation. Must be inside @vma.
- * @hugepage: Unused (was: For hugepages try only preferred node if possible).
*
* Allocate a folio for a specific address in @vma, using the appropriate
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
@@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* Return: The folio on success or NULL if allocation fails.
*/
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, bool hugepage)
+ unsigned long addr)
{
struct mempolicy *pol;
pgoff_t ilx;
diff --git a/mm/migrate.c b/mm/migrate.c
index 47a8d102ae7a..2ce6b4b814df 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -695,6 +695,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
if (folio_test_idle(folio))
folio_set_idle(newfolio);
+ folio_migrate_refs(newfolio, folio);
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
@@ -1732,7 +1733,7 @@ static int migrate_pages_batch(struct list_head *from,
list_for_each_entry_safe(folio, folio2, from, lru) {
is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4ba5607aaf19..1c205b0a86ed 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = BITS_PER_LONG;
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+ width = shift - NR_NON_PAGEFLAG_BITS;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
@@ -2639,7 +2638,7 @@ void __init mm_core_init(void)
BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
page_alloc_init_cpuhp();
-
+ alloc_tag_sec_init();
/*
* page_ext requires contiguous pages,
* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
diff --git a/mm/mmap.c b/mm/mmap.c
index 79d541f1502b..386429f7db5a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -577,22 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/*
- * We account for memory if it's a private writeable mapping,
- * not hugepages and VM_NORESERVE wasn't set.
- */
-static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
-{
- /*
- * hugetlb has its own accounting separate from the core VM
- * VM_HUGETLB may not be set yet so we cannot check for that flag.
- */
- if (file && is_file_hugepages(file))
- return false;
-
- return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
-}
-
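The removed accountable_mapping() relies on a single mask-and-compare to say "private, writable, and not VM_NORESERVE". A toy demonstration of that bit test, with made-up flag values rather than the kernel's, is:

#include <stdio.h>

#define VM_WRITE     0x1u
#define VM_SHARED    0x2u
#define VM_NORESERVE 0x4u	/* values are illustrative, not the kernel's */

/* Accounted only if masking with all three flags leaves exactly VM_WRITE. */
static int accountable(unsigned int vm_flags)
{
	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}

int main(void)
{
	printf("%d\n", accountable(VM_WRITE));			/* 1 */
	printf("%d\n", accountable(VM_WRITE | VM_SHARED));	/* 0 */
	printf("%d\n", accountable(VM_WRITE | VM_NORESERVE));	/* 0 */
	printf("%d\n", accountable(0));				/* 0 */
	return 0;
}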
/**
* unmapped_area() - Find an area between the low_limit and the high_limit with
* the correct alignment and offset, all from @info. Note: current->mm is used
@@ -776,6 +760,8 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
return vm_unmapped_area(&info);
}
@@ -826,6 +812,8 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
addr = vm_unmapped_area(&info);
/*
@@ -1051,6 +1039,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
@@ -1086,11 +1076,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1104,16 +1090,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1122,7 +1098,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1149,6 +1124,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
address &= PAGE_MASK;
if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
@@ -1178,11 +1155,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1196,16 +1169,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1215,7 +1178,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1358,224 +1320,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-static unsigned long __mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = NULL;
- pgoff_t pglen = PHYS_PFN(len);
- unsigned long charged = 0;
- struct vma_munmap_struct vms;
- struct ma_state mas_detach;
- struct maple_tree mt_detach;
- unsigned long end = addr + len;
- int error;
- VMA_ITERATOR(vmi, mm, addr);
- VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
-
- vmg.file = file;
- /* Find the first overlapping VMA */
- vma = vma_find(&vmi, end);
- init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
- if (vma) {
- mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_on_stack(mt_detach);
- mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
- /* Prepare to unmap any existing mapping in the area */
- error = vms_gather_munmap_vmas(&vms, &mas_detach);
- if (error)
- goto gather_failed;
-
- vmg.next = vms.next;
- vmg.prev = vms.prev;
- vma = NULL;
- } else {
- vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
- }
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) {
- error = -ENOMEM;
- goto abort_munmap;
- }
-
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = pglen;
- charged -= vms.nr_accounted;
- if (charged) {
- error = security_vm_enough_memory_mm(mm, charged);
- if (error)
- goto abort_munmap;
- }
-
- vms.nr_accounted = 0;
- vm_flags |= VM_ACCOUNT;
- vmg.flags = vm_flags;
- }
-
- /*
- * clear PTEs while the vma is still in the tree so that rmap
- * cannot race with the freeing later in the truncate scenario.
- * This is also needed for mmap_file(), which is why vm_ops
- * close function is called.
- */
- vms_clean_up_area(&vms, &mas_detach);
- vma = vma_merge_new_range(&vmg);
- if (vma)
- goto expanded;
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma_iter_config(&vmi, addr, end);
- vma_set_range(vma, addr, end, pgoff);
- vm_flags_init(vma, vm_flags);
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
-
- if (vma_iter_prealloc(&vmi, vma)) {
- error = -ENOMEM;
- goto free_vma;
- }
-
- if (file) {
- vma->vm_file = get_file(file);
- error = mmap_file(file, vma);
- if (error)
- goto unmap_and_free_file_vma;
-
- /* Drivers cannot alter the address of the VMA. */
- WARN_ON_ONCE(addr != vma->vm_start);
- /*
- * Drivers should not permit writability when previously it was
- * disallowed.
- */
- VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
- !(vm_flags & VM_MAYWRITE) &&
- (vma->vm_flags & VM_MAYWRITE));
-
- vma_iter_config(&vmi, addr, end);
- /*
- * If vm_flags changed after mmap_file(), we should try merge
- * vma again as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
- struct vm_area_struct *merge;
-
- vmg.flags = vma->vm_flags;
- /* If this fails, state is reset ready for a reattempt. */
- merge = vma_merge_new_range(&vmg);
-
- if (merge) {
- /*
- * ->mmap() can change vma->vm_file and fput
- * the original file. So fput the vma->vm_file
- * here or we would add an extra fput for file
- * and cause general protection fault
- * ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto file_expanded;
- }
- vma_iter_config(&vmi, addr, end);
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_iter_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
-#ifdef CONFIG_SPARC64
- /* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(vm_flags));
-#endif
-
- /* Lock the VMA since it is modified after insertion into VMA tree */
- vma_start_write(vma);
- vma_iter_store(&vmi, vma);
- mm->map_count++;
- vma_link_file(vma);
-
- /*
- * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
-file_expanded:
- file = vma->vm_file;
- ksm_add_vma(vma);
-expanded:
- perf_event_mmap(vma);
-
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
-
- vm_stat_account(mm, vm_flags, pglen);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vm_flags_clear(vma, VM_LOCKED_MASK);
- else
- mm->locked_vm += pglen;
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vm_flags_set(vma, VM_SOFTDIRTY);
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_file_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- vma_iter_set(&vmi, vma->vm_end);
- /* Undo any partial mapping done by a device driver. */
- unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
-free_iter_vma:
- vma_iter_free(&vmi);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
-
-abort_munmap:
- vms_abort_munmap_vmas(&vms, &mas_detach);
-gather_failed:
- return error;
-}
-
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 368b840e7508..f186d57df2c6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -19,43 +19,23 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
#ifdef CONFIG_MEMCG
-static atomic_t reg_refcount;
-
/*
* Size of the buffer for memcg path names. Ignoring stack trace support,
* trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
*/
#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-int trace_mmap_lock_reg(void)
-{
- atomic_inc(&reg_refcount);
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
- atomic_dec(&reg_refcount);
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- char buf[MEMCG_PATH_BUF_SIZE]; \
- get_mm_memcg_path(mm, buf, sizeof(buf)); \
- trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ if (trace_mmap_lock_##type##_enabled()) { \
+ char buf[MEMCG_PATH_BUF_SIZE]; \
+ get_mm_memcg_path(mm, buf, sizeof(buf)); \
+ trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
+ } \
} while (0)
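The reworked macro above only assembles the memcg path when the tracepoint is actually enabled, replacing the manual reg_refcount bookkeeping. A minimal user-space illustration of that guard-before-expensive-work pattern (the names below are stand-ins, not the real tracing API) is:

#include <stdbool.h>
#include <stdio.h>

static bool trace_enabled;		/* stands in for trace_..._enabled() */

static void build_expensive_path(char *buf, size_t len)
{
	snprintf(buf, len, "/sys/fs/cgroup/example");	/* placeholder work */
}

#define TRACE_EVENT_GUARDED(msg)					\
	do {								\
		if (trace_enabled) {					\
			char buf[256];					\
			build_expensive_path(buf, sizeof(buf));		\
			printf("%s: %s\n", msg, buf);			\
		}							\
	} while (0)

int main(void)
{
	TRACE_EVENT_GUARDED("lock");	/* disabled: no path built */
	trace_enabled = true;
	TRACE_EVENT_GUARDED("lock");	/* enabled: path built and emitted */
	return 0;
}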
#else /* !CONFIG_MEMCG */
-int trace_mmap_lock_reg(void)
-{
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
-}
-
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
@@ -65,16 +45,13 @@ void trace_mmap_lock_unreg(void)
#ifdef CONFIG_MEMCG
/*
* Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined or the trace event is being unregistered, empty string is written.
+ * determined, an empty string is written.
*/
static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
{
struct mem_cgroup *memcg;
buf[0] = '\0';
- /* No need to get path if no trace event is registered. */
- if (!atomic_read(&reg_refcount))
- return;
memcg = get_mem_cgroup_from_mm(mm);
if (memcg == NULL)
return;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6f450af3252e..516b1d847e2c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -236,9 +236,11 @@ static long change_pte_range(struct mmu_gather *tlb,
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore error swap entries unconditionally,
- * because any access should sigbus anyway.
+ * because any access should sigbus/sigsegv
+ * anyway.
*/
- if (is_poisoned_swp_entry(entry))
+ if (is_poisoned_swp_entry(entry) ||
+ is_guard_swp_entry(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
diff --git a/mm/mremap.c b/mm/mremap.c
index dee98ff2bbd6..60473413836b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -140,6 +140,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
{
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
+ pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
@@ -175,7 +176,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
err = -EAGAIN;
goto out;
}
- new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ /*
+ * Now new_pte is none, so the hpage_collapse_scan_file() path cannot find
+ * this by traversing file->f_mapping, so there is no concurrency with
+ * retract_page_tables(). In addition, we already hold the exclusive
+ * mmap_lock, so this new_pte page is stable and there is no need to get
+ * pmdval and do a pmd_same() check.
+ */
+ new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
+ &new_ptl);
if (!new_pte) {
pte_unmap_unlock(old_pte, old_ptl);
err = -EAGAIN;
@@ -817,17 +826,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
-static struct vm_area_struct *vma_to_resize(unsigned long addr,
+/*
+ * resize_is_valid() - Ensure the vma can be resized to the new length at the given
+ * address.
+ *
+ * @vma: The vma to resize
+ * @addr: The old address
+ * @old_len: The current size
+ * @new_len: The desired size
+ * @flags: The vma flags
+ *
+ * Return 0 on success, error otherwise.
+ */
+static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned long pgoff;
- vma = vma_lookup(mm, addr);
- if (!vma)
- return ERR_PTR(-EFAULT);
-
/*
* !old_len is a special case where an attempt is made to 'duplicate'
* a mapping. This makes no sense for private mappings as it will
@@ -838,39 +854,53 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
*/
if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
if ((flags & MREMAP_DONTUNMAP) &&
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (new_len == old_len)
- return vma;
+ return 0;
/* Need to be careful about a growing mapping */
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- return vma;
+ return 0;
}
+/*
+ * mremap_to() - remap a vma to a new location
+ * @addr: The old address
+ * @old_len: The old size
+ * @new_addr: The target address
+ * @new_len: The new size
+ * @locked: If the returned vma is locked (VM_LOCKED)
+ * @flags: the mremap flags
+ * @uf: The mremap userfaultfd context
+ * @uf_unmap_early: The userfaultfd unmap early context
+ * @uf_unmap: The userfaultfd unmap context
+ *
+ * Returns: The new address of the vma or an error.
+ */
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
unsigned long flags, struct vm_userfaultfd_ctx *uf,
@@ -879,18 +909,18 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long ret = -EINVAL;
+ unsigned long ret;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
- goto out;
+ return -EINVAL;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- goto out;
+ return -EINVAL;
/* Ensure the old/new locations do not overlap */
if (addr + old_len > new_addr && new_addr + new_len > addr)
- goto out;
+ return -EINVAL;
/*
* move_vma() need us to stay 4 maps below the threshold, otherwise
@@ -917,27 +947,28 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
*/
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
- goto out;
+ return ret;
}
if (old_len > new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
if (ret)
- goto out;
+ return ret;
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, flags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ return -EFAULT;
+
+ ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ if (ret)
+ return ret;
/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
if (flags & MREMAP_DONTUNMAP &&
!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
if (flags & MREMAP_FIXED)
@@ -950,17 +981,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
- goto out;
+ return ret;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
new_addr = ret;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
- uf_unmap);
-
-out:
- return ret;
+ return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
+ uf, uf_unmap);
}
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
@@ -1105,11 +1133,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, flags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ if (ret)
goto out;
- }
/* old_len exactly to the end of the area..
*/
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..81d6e980e8a9 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
case MADV_REMOVE:
case MADV_DONTFORK:
case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
return true;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..1c485beb0b93 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -24,7 +24,6 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 72a5d8836425..fdb89ce85fff 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -917,7 +917,9 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
unsigned long thresh)
{
struct wb_domain *dom = dtc_dom(dtc);
+ struct bdi_writeback *wb = dtc->wb;
u64 wb_thresh;
+ u64 wb_max_thresh;
unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
@@ -931,11 +933,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
- wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
+ wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
- wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
+ /*
+ * With strictlimit flag, the wb_thresh is treated as
+ * a hard limit in balance_dirty_pages() and wb_position_ratio().
+ * It's possible that wb_thresh is close to zero, not because
+ * the device is slow, but because it has been inactive.
+ * To prevent occasional writes from being blocked, we raise wb_thresh.
+ */
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
+ u64 wb_scale_thresh = 0;
+
+ if (limit > dtc->dirty)
+ wb_scale_thresh = (limit - dtc->dirty) / 100;
+ wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ }
return wb_thresh;
}
@@ -2724,8 +2743,6 @@ EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold folio_memcg_lock().
- *
* NOTE: This relies on being atomic wrt interrupts.
*/
static void folio_account_dirtied(struct folio *folio,
@@ -2758,7 +2775,6 @@ static void folio_account_dirtied(struct folio *folio,
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold folio_memcg_lock().
*/
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
@@ -2776,9 +2792,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold folio_memcg_lock(). It is the caller's
- * responsibility to prevent the folio from being truncated while
- * this function is in progress, although it may have been truncated
+ * It is the caller's responsibility to prevent the folio from being truncated
+ * while this function is in progress, although it may have been truncated
* before this function is called. Most callers have the folio locked.
* A few have the folio blocked from truncation through other means (e.g.
* zap_vma_pages() has it mapped and is holding the page table lock).
@@ -2822,14 +2837,10 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
*/
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- folio_memcg_lock(folio);
- if (folio_test_set_dirty(folio)) {
- folio_memcg_unlock(folio);
+ if (folio_test_set_dirty(folio))
return false;
- }
__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
- folio_memcg_unlock(folio);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2956,14 +2967,12 @@ void __folio_cancel_dirty(struct folio *folio)
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
- folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (folio_test_clear_dirty(folio))
folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
- folio_memcg_unlock(folio);
} else {
folio_clear_dirty(folio);
}
@@ -3074,7 +3083,6 @@ bool __folio_end_writeback(struct folio *folio)
struct address_space *mapping = folio_mapping(folio);
bool ret;
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -3105,7 +3113,6 @@ bool __folio_end_writeback(struct folio *folio)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
node_stat_mod_folio(folio, NR_WRITTEN, nr);
- folio_memcg_unlock(folio);
return ret;
}
@@ -3118,7 +3125,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct inode *inode = mapping->host;
@@ -3159,7 +3165,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
- folio_memcg_unlock(folio);
access_ret = arch_make_folio_accessible(folio);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 119a3a52f96e..1cb4b8c8886d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5377,7 +5377,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- page_group_by_mobility_disabled ? "off" : "on",
+ str_off_on(page_group_by_mobility_disabled),
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
diff --git a/mm/page_io.c b/mm/page_io.c
index 01749b99fb54..4b4ea8e49cf6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -277,6 +277,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
swap_zeromap_folio_clear(folio);
}
if (zswap_store(folio)) {
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
folio_unlock(folio);
return 0;
}
@@ -296,8 +297,9 @@ static inline void count_swpout_vm_event(struct folio *folio)
count_memcg_folio_events(folio, THP_SWPOUT, 1);
count_vm_event(THP_SWPOUT);
}
- count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
#endif
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
+ count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
@@ -493,6 +495,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
for (p = 0; p < sio->pages; p++) {
struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
folio_mark_uptodate(folio);
folio_unlock(folio);
}
@@ -586,6 +590,8 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
* attempt to access it in the page fault retry time check.
*/
get_task_struct(current);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio_wait(&bio);
__end_swap_bio_read(&bio);
@@ -601,6 +607,8 @@ static void swap_read_folio_bdev_async(struct folio *folio,
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_read;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio(bio);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa208..81839a9e74f1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,7 +13,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
{
pte_t ptent;
@@ -25,6 +26,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
return !!pvmw->pte;
}
+again:
/*
* It is important to return the ptl corresponding to pte,
* in case *pvmw->pmd changes underneath us; so we need to
@@ -32,8 +34,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* proceeds to loop over next ptes, and finds a match later.
* Though, in most cases, page lock already protects this.
*/
- pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
- pvmw->address, ptlp);
+ pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, pmdvalp, ptlp);
if (!pvmw->pte)
return false;
@@ -67,8 +69,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
} else if (!pte_present(ptent)) {
return false;
}
+ spin_lock(*ptlp);
+ if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, *ptlp);
+ goto again;
+ }
pvmw->ptl = *ptlp;
- spin_lock(pvmw->ptl);
+
return true;
}
@@ -278,7 +285,7 @@ restart:
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw, &ptl)) {
+ if (!map_pte(pvmw, &pmde, &ptl)) {
if (!pvmw->pte)
goto restart;
goto next_pte;
@@ -305,8 +312,13 @@ next_pte:
} while (pte_none(ptep_get(pvmw->pte)));
if (!pvmw->ptl) {
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, ptl);
+ pvmw->pte = NULL;
+ goto restart;
+ }
pvmw->ptl = ptl;
- spin_lock(pvmw->ptl);
}
goto this_pte;
} while (pvmw->address < end);
@@ -325,10 +337,10 @@ next_pte:
* outside the VMA or not present, returns -EFAULT.
* Only valid for normal file or anonymous VMAs.
*/
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(const struct page *page,
+ struct vm_area_struct *vma)
{
- struct folio *folio = page_folio(page);
- pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
+ const struct folio *folio = page_folio(page);
struct page_vma_mapped_walk pvmw = {
.pfn = page_to_pfn(page),
.nr_pages = 1,
@@ -336,7 +348,7 @@ unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.flags = PVMW_SYNC,
};
- pvmw.address = vma_address(vma, pgoff, 1);
+ pvmw.address = vma_address(vma, page_pgoff(folio, page), 1);
if (pvmw.address == -EFAULT)
goto out;
if (!page_vma_mapped_walk(&pvmw))
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5f9f01532e67..e478777c86e1 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,9 +3,14 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
+#include <linux/mmu_context.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
/*
* We want to know the real level where a entry is located ignoring any
* folding of levels which may be happening. For example if p4d is folded then
@@ -29,9 +34,23 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
int err = 0;
for (;;) {
- err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
- if (err)
- break;
+ if (ops->install_pte && pte_none(ptep_get(pte))) {
+ pte_t new_pte;
+
+ err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
+ walk);
+ if (err)
+ break;
+
+ set_pte_at(walk->mm, addr, pte, new_pte);
+ /* Non-present before, so for arches that need it. */
+ if (!WARN_ON_ONCE(walk->no_vma))
+ update_mmu_cache(walk->vma, addr, pte);
+ } else {
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ if (err)
+ break;
+ }
if (addr >= end - PAGE_SIZE)
break;
addr += PAGE_SIZE;
@@ -81,6 +100,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd_t *pmd;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(3);
@@ -89,11 +110,14 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
again:
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pte_alloc(walk->mm, pmd);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
walk->action = ACTION_SUBTREE;
@@ -109,18 +133,25 @@ again:
if (walk->action == ACTION_AGAIN)
goto again;
-
- /*
- * Check this here so we only break down trans_huge
- * pages when we _need_ to
- */
- if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
- walk->action == ACTION_CONTINUE ||
- !(ops->pte_entry))
+ if (walk->action == ACTION_CONTINUE)
continue;
+ if (!has_handler) { /* No handlers for lower page tables. */
+ if (!has_install)
+ continue; /* Nothing to do. */
+ /*
+ * We are ONLY installing, so avoid unnecessarily
+ * splitting a present huge page.
+ */
+ if (pmd_present(*pmd) &&
+ (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ continue;
+ }
+
if (walk->vma)
split_huge_pmd(walk->vma, pmd, addr);
+ else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+ continue; /* Nothing to do. */
err = walk_pte_range(pmd, addr, next, walk);
if (err)
@@ -140,6 +171,8 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
pud_t *pud;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pmd_entry || ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(2);
@@ -148,11 +181,14 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
again:
next = pud_addr_end(addr, end);
if (pud_none(*pud)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pmd_alloc(walk->mm, pud, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
walk->action = ACTION_SUBTREE;
@@ -164,14 +200,26 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (walk->action == ACTION_AGAIN)
goto again;
-
- if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
- walk->action == ACTION_CONTINUE ||
- !(ops->pmd_entry || ops->pte_entry))
+ if (walk->action == ACTION_CONTINUE)
continue;
+ if (!has_handler) { /* No handlers for lower page tables. */
+ if (!has_install)
+ continue; /* Nothing to do. */
+ /*
+ * We are ONLY installing, so avoid unnecessarily
+ * splitting a present huge page.
+ */
+ if (pud_present(*pud) &&
+ (pud_trans_huge(*pud) || pud_devmap(*pud)))
+ continue;
+ }
+
if (walk->vma)
split_huge_pud(walk->vma, pud, addr);
+ else if (pud_leaf(*pud) || !pud_present(*pud))
+ continue; /* Nothing to do. */
+
if (pud_none(*pud))
goto again;
@@ -189,6 +237,8 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
p4d_t *p4d;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(1);
@@ -196,18 +246,21 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pud_alloc(walk->mm, p4d, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
if (ops->p4d_entry) {
err = ops->p4d_entry(p4d, addr, next, walk);
if (err)
break;
}
- if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (has_handler || has_install)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -222,6 +275,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+ ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
if (walk->pgd)
@@ -231,18 +287,21 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __p4d_alloc(walk->mm, pgd, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, 0, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
if (ops->pgd_entry) {
err = ops->pgd_entry(pgd, addr, next, walk);
if (err)
break;
}
- if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (has_handler || has_install)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -334,6 +393,11 @@ static int __walk_page_range(unsigned long start, unsigned long end,
int err = 0;
struct vm_area_struct *vma = walk->vma;
const struct mm_walk_ops *ops = walk->ops;
+ bool is_hugetlb = is_vm_hugetlb_page(vma);
+
+ /* We do not support hugetlb PTE installation. */
+ if (ops->install_pte && is_hugetlb)
+ return -EINVAL;
if (ops->pre_vma) {
err = ops->pre_vma(start, end, walk);
@@ -341,7 +405,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
return err;
}
- if (is_vm_hugetlb_page(vma)) {
+ if (is_hugetlb) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
@@ -380,47 +444,14 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
#endif
}
-/**
- * walk_page_range - walk page table with caller specific callbacks
- * @mm: mm_struct representing the target process of page table walk
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @ops: operation to call during the walk
- * @private: private data for callbacks' usage
- *
- * Recursively walk the page table tree of the process represented by @mm
- * within the virtual address range [@start, @end). During walking, we can do
- * some caller-specific works for each entry, by setting up pmd_entry(),
- * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
- * callbacks, the associated entries/pages are just ignored.
- * The return values of these callbacks are commonly defined like below:
- *
- * - 0 : succeeded to handle the current entry, and if you don't reach the
- * end address yet, continue to walk.
- * - >0 : succeeded to handle the current entry, and return to the caller
- * with caller specific value.
- * - <0 : failed to handle the current entry, and return to the caller
- * with error code.
- *
- * Before starting to walk page table, some callers want to check whether
- * they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
- * purpose.
- *
- * If operations need to be staged before and committed after a vma is walked,
- * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
- * since it is intended to handle commit-type operations, can't return any
- * errors.
- *
- * struct mm_walk keeps current values of some common data like vma and pmd,
- * which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @private should be helpful.
+/*
+ * See the comment for walk_page_range(); this performs the heavy lifting of the
+ * operation, but places no restrictions on how the walk proceeds.
*
- * Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
- * because these function traverse vma list and/or access to vma's data.
+ * We usually restrict the ability to install PTEs, but this functionality is
+ * available to internal memory management code and provided in mm/internal.h.
*/
-int walk_page_range(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
{
@@ -479,6 +510,80 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
+/*
+ * Determine if the walk operations specified are permitted to be used for a
+ * page table walk.
+ *
+ * This check is performed on all functions which are parameterised by walk
+ * operations and exposed in include/linux/pagewalk.h.
+ *
+ * Internal memory management code can use the walk_page_range_mm() function to
+ * be able to use all page walking operations.
+ */
+static bool check_ops_valid(const struct mm_walk_ops *ops)
+{
+ /*
+ * The installation of PTEs is solely under the control of memory
+ * management logic and subject to many subtle locking, security and
+ * cache considerations so we cannot permit other users to do so, and
+ * certainly not for exported symbols.
+ */
+ if (ops->install_pte)
+ return false;
+
+ return true;
+}
+
+/**
+ * walk_page_range - walk page table with caller specific callbacks
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
+ *
+ * Recursively walk the page table tree of the process represented by @mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *
+ * - 0 : succeeded to handle the current entry, and if you don't reach the
+ * end address yet, continue to walk.
+ * - >0 : succeeded to handle the current entry, and return to the caller
+ * with caller specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with error code.
+ *
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
+ * purpose.
+ *
+ * If operations need to be staged before and committed after a vma is walked,
+ * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
+ * since it is intended to handle commit-type operations, can't return any
+ * errors.
+ *
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @private should be helpful.
+ *
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
+ * because these function traverse vma list and/or access to vma's data.
+ */
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ return walk_page_range_mm(mm, start, end, ops, private);
+}
+
/**
* walk_page_range_novma - walk a range of pagetables not backed by a vma
* @mm: mm_struct representing the target process of page table walk
@@ -494,7 +599,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
* walking the kernel pages tables or page tables for firmware.
*
* Note: Be careful to walk the kernel pages tables, the caller may be need to
- * take other effective approache (mmap lock may be insufficient) to prevent
+ * take other effective approaches (mmap lock may be insufficient) to prevent
* the intermediate kernel page tables belonging to the specified address range
* from being freed (e.g. memory hot-remove).
*/
@@ -513,6 +618,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
if (start >= end || !walk.mm)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
/*
* 1) For walking the user virtual address space:
@@ -556,6 +663,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
@@ -574,6 +683,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
@@ -623,6 +734,9 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
unsigned long start_addr, end_addr;
int err = 0;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
lockdep_assert_held(&mapping->i_mmap_rwsem);
vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
first_index + nr - 1) {
diff --git a/mm/percpu.c b/mm/percpu.c
index da21680ff294..d8dd31a2e407 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -253,13 +253,13 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
- page->index = (unsigned long)pcpu;
+ page->private = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
- return (struct pcpu_chunk *)page->index;
+ return (struct pcpu_chunk *)page->private;
}
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
@@ -1864,6 +1864,10 @@ restart:
area_found:
pcpu_stats_area_alloc(chunk, size);
+
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+ pcpu_schedule_balance_work();
+
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
@@ -1891,9 +1895,6 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
}
- if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
- pcpu_schedule_balance_work();
-
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a78a4adf711a..5297dcc38c37 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -305,8 +305,8 @@ nomap:
return NULL;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp)
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
{
pmd_t pmdval;
pte_t *pte;
@@ -317,6 +317,19 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
+{
+ pte_t *pte;
+
+ VM_WARN_ON_ONCE(!pmdvalp);
+ pte = __pte_offset_map(pmd, addr, pmdvalp);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, pmdvalp);
+ return pte;
+}
+
/*
* pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
* __pte_offset_map_lock() below, is usually called with the pmd pointer for
@@ -347,14 +360,28 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
* and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
* afterwards.
*
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
* but when successful, it also outputs a pointer to the spinlock in ptlp - as
* pte_offset_map_lock() does, but in this case without locking it. This helps
* the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns. In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. Even after grabbing the spinlock,
+ * we might be looking either at a page table that is still mapped or one that
+ * was unmapped and is about to get freed. But for R/O access this is sufficient.
+ * So it is only applicable for read-only cases where any modification operations
+ * to the page table are not allowed even if the corresponding spinlock is held
+ * afterwards.
+ *
+ * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
+ * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
+ * It is applicable for may-write cases where any modification operations to the
+ * page table may happen after the corresponding spinlock is held afterwards.
+ * But users should make sure the page table is stable, for example by checking
+ * pte_same() or by checking pmd_same() against the output pmdval, before
+ * performing the write operations.
+ *
+ * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
+ * be read-only/read-write protected.
*
* Note that free_pgtables(), used after unmapping detached vmas, or when
* exiting the whole mm, does not take page table lock before freeing a page
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b308e96cd05a..656d3e88755b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -201,8 +201,8 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
}
mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- if (!mm || IS_ERR(mm)) {
- rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ rc = PTR_ERR(mm);
/*
* Explicitly map EACCES to EPERM as EPERM is a more
* appropriate error code for process_vw_readv/writev
diff --git a/mm/readahead.c b/mm/readahead.c
index 9a807727d809..8f1cf599b572 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -206,9 +206,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
- unsigned long ra_folio_index, index = readahead_index(ractl);
+ unsigned long index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- unsigned long mark, i = 0;
+ unsigned long mark = ULONG_MAX, i = 0;
unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
@@ -232,9 +232,14 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* index that only has lookahead or "async_region" to set the
* readahead flag.
*/
- ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
- min_nrpages);
- mark = ra_folio_index - index;
+ if (lookahead_size <= nr_to_read) {
+ unsigned long ra_folio_index;
+
+ ra_folio_index = round_up(readahead_index(ractl) +
+ nr_to_read - lookahead_size,
+ min_nrpages);
+ mark = ra_folio_index - index;
+ }
nr_to_read += readahead_index(ractl) - index;
ractl->_index = index;
diff --git a/mm/rmap.c b/mm/rmap.c
index 73d5998677d4..c6c4d4ea29a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,6 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in block_dirty_folio)
- * folio_lock_memcg move_lock (in block_dirty_folio)
* i_pages lock (widely used)
* lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
@@ -497,7 +496,7 @@ void __init anon_vma_init(void)
* concurrently without folio lock protection). See folio_lock_anon_vma_read()
* which has already covered that, and comment above remap_pages().
*/
-struct anon_vma *folio_get_anon_vma(struct folio *folio)
+struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
@@ -541,7 +540,7 @@ out:
* reference like with folio_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
@@ -768,15 +767,27 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
-/*
- * At what user virtual address is page expected in vma?
- * Caller should check the page is actually part of the vma.
+/**
+ * page_address_in_vma - The virtual address of a page in this VMA.
+ * @folio: The folio containing the page.
+ * @page: The page within the folio.
+ * @vma: The VMA we need to know the address in.
+ *
+ * Calculates the user virtual address of this page in the specified VMA.
+ * It is the caller's responsibility to check the page is actually
+ * within the VMA. There may not currently be a PTE pointing at this
+ * page, but if a page fault occurs at this address, this is the page
+ * which will be accessed.
+ *
+ * Context: Caller should hold a reference to the folio. Caller should
+ * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
+ * VMA from being altered.
+ *
+ * Return: The virtual address corresponding to this page in the VMA.
*/
-unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_address_in_vma(const struct folio *folio,
+ const struct page *page, const struct vm_area_struct *vma)
{
- struct folio *folio = page_folio(page);
- pgoff_t pgoff;
-
if (folio_test_anon(folio)) {
struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
@@ -792,9 +803,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
}
- /* The !page__anon_vma above handles KSM folios */
- pgoff = folio->index + folio_page_idx(folio, page);
- return vma_address(vma, pgoff, 1);
+ /* KSM folios don't reach here because of the !page__anon_vma check */
+ return vma_address(vma, page_pgoff(folio, page), 1);
}
/*
@@ -1261,8 +1271,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
-static void __page_check_anon_rmap(struct folio *folio, struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+static void __page_check_anon_rmap(const struct folio *folio,
+ const struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
{
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
@@ -1277,7 +1288,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
*/
VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
folio);
- VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
page);
}
@@ -2559,7 +2570,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(root);
}
-static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
diff --git a/mm/shmem.c b/mm/shmem.c
index c7881e16f4be..ccb9629a0f70 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -141,6 +141,7 @@ static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
+static bool shmem_orders_configured __initdata;
#endif
#ifdef CONFIG_TMPFS
@@ -553,17 +554,15 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma,
- unsigned long vm_flags)
+static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ unsigned long vm_flags)
{
- struct mm_struct *mm = vma ? vma->vm_mm : NULL;
loff_t i_size;
- if (!S_ISREG(inode->i_mode))
+ if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
return false;
- if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
+ if (!S_ISREG(inode->i_mode))
return false;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
@@ -581,7 +580,7 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
return true;
fallthrough;
case SHMEM_HUGE_ADVISE:
- if (mm && (vm_flags & VM_HUGEPAGE))
+ if (vm_flags & VM_HUGEPAGE)
return true;
fallthrough;
default:
@@ -589,35 +588,39 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
}
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma, unsigned long vm_flags)
+static int shmem_parse_huge(const char *str)
{
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
+ int huge;
- return __shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vma, vm_flags);
-}
+ if (!str)
+ return -EINVAL;
-#if defined(CONFIG_SYSFS)
-static int shmem_parse_huge(const char *str)
-{
if (!strcmp(str, "never"))
- return SHMEM_HUGE_NEVER;
- if (!strcmp(str, "always"))
- return SHMEM_HUGE_ALWAYS;
- if (!strcmp(str, "within_size"))
- return SHMEM_HUGE_WITHIN_SIZE;
- if (!strcmp(str, "advise"))
- return SHMEM_HUGE_ADVISE;
- if (!strcmp(str, "deny"))
- return SHMEM_HUGE_DENY;
- if (!strcmp(str, "force"))
- return SHMEM_HUGE_FORCE;
- return -EINVAL;
+ huge = SHMEM_HUGE_NEVER;
+ else if (!strcmp(str, "always"))
+ huge = SHMEM_HUGE_ALWAYS;
+ else if (!strcmp(str, "within_size"))
+ huge = SHMEM_HUGE_WITHIN_SIZE;
+ else if (!strcmp(str, "advise"))
+ huge = SHMEM_HUGE_ADVISE;
+ else if (!strcmp(str, "deny"))
+ huge = SHMEM_HUGE_DENY;
+ else if (!strcmp(str, "force"))
+ huge = SHMEM_HUGE_FORCE;
+ else
+ return -EINVAL;
+
+ if (!has_transparent_hugepage() &&
+ huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
+ return -EINVAL;
+
+ /* Do not override huge allocation policy with non-PMD sized mTHP */
+ if (huge == SHMEM_HUGE_FORCE &&
+ huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ return huge;
}
-#endif
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
@@ -777,8 +780,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma, unsigned long vm_flags)
+ loff_t write_end, bool shmem_huge_force,
+ unsigned long vm_flags)
{
return false;
}
@@ -1173,7 +1176,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1658,6 +1661,23 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool shmem_hpage_pmd_enabled(void)
+{
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
+ shmem_huge != SHMEM_HUGE_NEVER)
+ return true;
+
+ return false;
+}
+
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
loff_t write_end, bool shmem_huge_force)
@@ -1673,7 +1693,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return 0;
global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vma, vm_flags);
+ shmem_huge_force, vm_flags);
if (!vma || !vma_is_anon_shmem(vma)) {
/*
* For tmpfs, we now only support PMD sized THP if huge page
@@ -2879,7 +2899,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
cache_no_acl(inode);
if (sbinfo->noswap)
mapping_set_unevictable(inode->i_mapping);
- mapping_set_large_folios(inode->i_mapping);
+
+ /* Don't consider 'deny' for emergencies and 'force' for testing */
+ if (sbinfo->huge)
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@@ -3140,27 +3163,19 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
unsigned long offset;
int error = 0;
ssize_t retval = 0;
- loff_t *ppos = &iocb->ki_pos;
-
- index = *ppos >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
for (;;) {
struct folio *folio = NULL;
struct page *page = NULL;
- pgoff_t end_index;
unsigned long nr, ret;
- loff_t i_size = i_size_read(inode);
+ loff_t end_offset, i_size = i_size_read(inode);
+ bool fallback_page_copy = false;
+ size_t fsize;
- end_index = i_size >> PAGE_SHIFT;
- if (index > end_index)
+ if (unlikely(iocb->ki_pos >= i_size))
break;
- if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
- if (nr <= offset)
- break;
- }
+ index = iocb->ki_pos >> PAGE_SHIFT;
error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
@@ -3176,24 +3191,29 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
error = -EIO;
break;
}
+
+ if (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))
+ fallback_page_copy = true;
}
/*
* We must evaluate after, since reads (unlike writes)
* are called without i_rwsem protection against truncate
*/
- nr = PAGE_SIZE;
i_size = i_size_read(inode);
- end_index = i_size >> PAGE_SHIFT;
- if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
- if (nr <= offset) {
- if (folio)
- folio_put(folio);
- break;
- }
+ if (unlikely(iocb->ki_pos >= i_size)) {
+ if (folio)
+ folio_put(folio);
+ break;
}
- nr -= offset;
+ end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
+ if (folio && likely(!fallback_page_copy))
+ fsize = folio_size(folio);
+ else
+ fsize = PAGE_SIZE;
+ offset = iocb->ki_pos & (fsize - 1);
+ nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
if (folio) {
/*
@@ -3201,10 +3221,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ if (mapping_writably_mapped(mapping)) {
+ if (likely(!fallback_page_copy))
+ flush_dcache_folio(folio);
+ else
+ flush_dcache_page(page);
+ }
+
/*
- * Mark the page accessed if we read the beginning.
+ * Mark the folio accessed if we read the beginning.
*/
if (!offset)
folio_mark_accessed(folio);
@@ -3212,9 +3237,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
- ret = copy_page_to_iter(page, offset, nr, to);
+ if (likely(!fallback_page_copy))
+ ret = copy_folio_to_iter(folio, offset, nr, to);
+ else
+ ret = copy_page_to_iter(page, offset, nr, to);
folio_put(folio);
-
} else if (user_backed_iter(to)) {
/*
* Copy to user tends to be so well optimized, but
@@ -3232,9 +3259,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
retval += ret;
- offset += ret;
- index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
+ iocb->ki_pos += ret;
if (!iov_iter_count(to))
break;
@@ -3245,7 +3270,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
cond_resched();
}
- *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
file_accessed(file);
return retval ? retval : error;
}
@@ -3334,11 +3358,16 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
len = min_t(size_t, len, npages * PAGE_SIZE);
do {
+ bool fallback_page_splice = false;
+ struct page *page = NULL;
+ pgoff_t index;
+ size_t size;
+
if (*ppos >= i_size_read(inode))
break;
- error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
- SGP_READ);
+ index = *ppos >> PAGE_SHIFT;
+ error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
@@ -3347,12 +3376,15 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (folio) {
folio_unlock(folio);
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) &&
- folio_test_has_hwpoisoned(folio))) {
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
error = -EIO;
break;
}
+
+ if (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))
+ fallback_page_splice = true;
}
/*
@@ -3366,7 +3398,17 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
isize = i_size_read(inode);
if (unlikely(*ppos >= isize))
break;
- part = min_t(loff_t, isize - *ppos, len);
+ /*
+ * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
+ * pages.
+ */
+ size = len;
+ if (unlikely(fallback_page_splice)) {
+ size_t offset = *ppos & ~PAGE_MASK;
+
+ size = umin(size, PAGE_SIZE - offset);
+ }
+ part = min_t(loff_t, isize - *ppos, size);
if (folio) {
/*
@@ -3374,8 +3416,12 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (mapping_writably_mapped(mapping))
- flush_dcache_folio(folio);
+ if (mapping_writably_mapped(mapping)) {
+ if (likely(!fallback_page_splice))
+ flush_dcache_folio(folio);
+ else
+ flush_dcache_page(page);
+ }
folio_mark_accessed(folio);
/*
* Ok, we have the page, and it's up-to-date, so we can
@@ -5224,7 +5270,8 @@ void __init shmem_init(void)
* Default to setting PMD-sized THP to inherit the global setting and
* disable all other multi-size THPs.
*/
- huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
+ if (!shmem_orders_configured)
+ huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
return;
@@ -5267,7 +5314,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
char tmp[16];
- int huge;
+ int huge, err;
if (count + 1 > sizeof(tmp))
return -EINVAL;
@@ -5278,20 +5325,14 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
huge = shmem_parse_huge(tmp);
if (huge == -EINVAL)
- return -EINVAL;
- if (!has_transparent_hugepage() &&
- huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
- return -EINVAL;
-
- /* Do not override huge allocation policy with non-PMD sized mTHP */
- if (huge == SHMEM_HUGE_FORCE &&
- huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
- return -EINVAL;
+ return huge;
shmem_huge = huge;
if (shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
- return count;
+
+ err = start_stop_khugepaged();
+ return err ? err : count;
}
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
@@ -5368,6 +5409,12 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
ret = -EINVAL;
}
+ if (ret > 0) {
+ int err = start_stop_khugepaged();
+
+ if (err)
+ ret = err;
+ }
return ret;
}
@@ -5375,6 +5422,126 @@ struct kobj_attribute thpsize_shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
+static int __init setup_transparent_hugepage_shmem(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge == -EINVAL) {
+ pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
+ return huge;
+ }
+
+ shmem_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+
+static char str_dup[PAGE_SIZE] __initdata;
+static int __init setup_thp_shmem(char *str)
+{
+ char *token, *range, *policy, *subtoken;
+ unsigned long always, inherit, madvise, within_size;
+ char *start_size, *end_size;
+ int start, end, nr;
+ char *p;
+
+ if (!str || strlen(str) + 1 > PAGE_SIZE)
+ goto err;
+ strscpy(str_dup, str);
+
+ always = huge_shmem_orders_always;
+ inherit = huge_shmem_orders_inherit;
+ madvise = huge_shmem_orders_madvise;
+ within_size = huge_shmem_orders_within_size;
+ p = str_dup;
+ while ((token = strsep(&p, ";")) != NULL) {
+ range = strsep(&token, ":");
+ policy = token;
+
+ if (!policy)
+ goto err;
+
+ while ((subtoken = strsep(&range, ",")) != NULL) {
+ if (strchr(subtoken, '-')) {
+ start_size = strsep(&subtoken, "-");
+ end_size = subtoken;
+
+ start = get_order_from_str(start_size,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ end = get_order_from_str(end_size,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ } else {
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_shmem boot parameter\n",
+ start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_shmem boot parameter\n",
+ end_size);
+ goto err;
+ }
+
+ if (start < 0 || end < 0 || start > end)
+ goto err;
+
+ nr = end - start + 1;
+ if (!strcmp(policy, "always")) {
+ bitmap_set(&always, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "advise")) {
+ bitmap_set(&madvise, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "inherit")) {
+ bitmap_set(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "within_size")) {
+ bitmap_set(&within_size, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ } else if (!strcmp(policy, "never")) {
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else {
+ pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
+ goto err;
+ }
+ }
+ }
+
+ huge_shmem_orders_always = always;
+ huge_shmem_orders_madvise = madvise;
+ huge_shmem_orders_inherit = inherit;
+ huge_shmem_orders_within_size = within_size;
+ shmem_orders_configured = true;
+ return 1;
+
+err:
+ pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
+ return 0;
+}
+__setup("thp_shmem=", setup_thp_shmem);
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
#else /* !CONFIG_SHMEM */
/*
diff --git a/mm/show_mem.c b/mm/show_mem.c
index ec885a398fa0..43afb56abbd3 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -285,8 +285,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
+ str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));
}
for_each_populated_zone(zone) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c0388b2e959d..cec67c5f37d8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -184,10 +184,6 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
return p;
}
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
@@ -201,10 +197,6 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -218,10 +210,6 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
return pud;
}
-void __weak __meminit pud_init(void *addr)
-{
-}
-
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
diff --git a/mm/sparse.c b/mm/sparse.c
index dc38539f8560..13b6624d3562 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
+ unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
@@ -720,19 +720,19 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
- unsigned long magic, nr_pages;
+ unsigned long type, nr_pages;
struct page *page = virt_to_page(memmap);
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = page->index;
+ type = bootmem_type(page);
- BUG_ON(magic == NODE_INFO);
+ BUG_ON(type == NODE_INFO);
maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
- removing_section_nr = page_private(page);
+ removing_section_nr = bootmem_info(page);
/*
* When this function is called, the removing section is
diff --git a/mm/swap.c b/mm/swap.c
index 59f30a981c6f..10decd9dffa1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -113,37 +113,6 @@ void __folio_put(struct folio *folio)
}
EXPORT_SYMBOL(__folio_put);
-/**
- * put_pages_list() - release a list of pages
- * @pages: list of pages threaded on page->lru
- *
- * Release a list of pages which are strung together on page.lru.
- */
-void put_pages_list(struct list_head *pages)
-{
- struct folio_batch fbatch;
- struct folio *folio, *next;
-
- folio_batch_init(&fbatch);
- list_for_each_entry_safe(folio, next, pages, lru) {
- if (!folio_put_testzero(folio))
- continue;
- if (folio_test_hugetlb(folio)) {
- free_huge_folio(folio);
- continue;
- }
- /* LRU flag must be clear because it's passed using the lru */
- if (folio_batch_add(&fbatch, folio) > 0)
- continue;
- free_unref_folios(&fbatch);
- }
-
- if (fbatch.nr)
- free_unref_folios(&fbatch);
- INIT_LIST_HEAD(pages);
-}
-EXPORT_SYMBOL(put_pages_list);
-
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
static void lru_add(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4669f29cf555..e0c0321b8ff7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -889,8 +889,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- enable_vma_readahead ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
diff --git a/mm/truncate.c b/mm/truncate.c
index 09fa809f921d..7c304d2f0052 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -23,42 +23,28 @@
#include <linux/rmap.h>
#include "internal.h"
-/*
- * Regular page slots are stabilized by the page lock even without the tree
- * itself locked. These unlocked entries need verification under the tree
- * lock.
- */
-static inline void __clear_shadow_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- XA_STATE(xas, &mapping->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
- if (xas_load(&xas) != entry)
- return;
- xas_store(&xas, NULL);
-}
-
static void clear_shadow_entries(struct address_space *mapping,
- struct folio_batch *fbatch, pgoff_t *indices)
+ unsigned long start, unsigned long max)
{
- int i;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct folio *folio;
/* Handled by shmem itself, or for DAX we do nothing. */
if (shmem_mapping(mapping) || dax_mapping(mapping))
return;
- spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xas_set_update(&xas, workingset_update_node);
- for (i = 0; i < folio_batch_count(fbatch); i++) {
- struct folio *folio = fbatch->folios[i];
+ spin_lock(&mapping->host->i_lock);
+ xas_lock_irq(&xas);
+ /* Clear all shadow entries from start to max */
+ xas_for_each(&xas, folio, max) {
if (xa_is_value(folio))
- __clear_shadow_entry(mapping, indices[i], folio);
+ xas_store(&xas, NULL);
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
@@ -68,54 +54,53 @@ static void clear_shadow_entries(struct address_space *mapping,
* Unconditionally remove exceptional entries. Usually called from truncate
* path. Note that the folio_batch may be altered by this function by removing
* exceptional entries similar to what folio_batch_remove_exceptionals() does.
+ * Please note that indices[] has entries in ascending order as guaranteed by
+ * either find_get_entries() or find_lock_entries().
*/
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
struct folio_batch *fbatch, pgoff_t *indices)
{
+ XA_STATE(xas, &mapping->i_pages, indices[0]);
+ int nr = folio_batch_count(fbatch);
+ struct folio *folio;
int i, j;
- bool dax;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
- for (j = 0; j < folio_batch_count(fbatch); j++)
+ for (j = 0; j < nr; j++)
if (xa_is_value(fbatch->folios[j]))
break;
- if (j == folio_batch_count(fbatch))
+ if (j == nr)
return;
- dax = dax_mapping(mapping);
- if (!dax) {
- spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ if (dax_mapping(mapping)) {
+ for (i = j; i < nr; i++) {
+ if (xa_is_value(fbatch->folios[i]))
+ dax_delete_mapping_entry(mapping, indices[i]);
+ }
+ goto out;
}
- for (i = j; i < folio_batch_count(fbatch); i++) {
- struct folio *folio = fbatch->folios[i];
- pgoff_t index = indices[i];
-
- if (!xa_is_value(folio)) {
- fbatch->folios[j++] = folio;
- continue;
- }
+ xas_set(&xas, indices[j]);
+ xas_set_update(&xas, workingset_update_node);
- if (unlikely(dax)) {
- dax_delete_mapping_entry(mapping, index);
- continue;
- }
+ spin_lock(&mapping->host->i_lock);
+ xas_lock_irq(&xas);
- __clear_shadow_entry(mapping, index, folio);
+ xas_for_each(&xas, folio, indices[nr-1]) {
+ if (xa_is_value(folio))
+ xas_store(&xas, NULL);
}
- if (!dax) {
- xa_unlock_irq(&mapping->i_pages);
- if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
- spin_unlock(&mapping->host->i_lock);
- }
- fbatch->nr = j;
+ xas_unlock_irq(&xas);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+out:
+ folio_batch_remove_exceptionals(fbatch);
}
/**
@@ -477,11 +462,13 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
unsigned long ret;
unsigned long count = 0;
int i;
- bool xa_has_values = false;
folio_batch_init(&fbatch);
while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ bool xa_has_values = false;
+ int nr = folio_batch_count(&fbatch);
+
+ for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
@@ -508,7 +495,7 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
}
if (xa_has_values)
- clear_shadow_entries(mapping, &fbatch, indices);
+ clear_shadow_entries(mapping, indices[0], indices[nr-1]);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
@@ -604,7 +591,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
- bool xa_has_values = false;
if (mapping_empty(mapping))
return 0;
@@ -612,7 +598,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ bool xa_has_values = false;
+ int nr = folio_batch_count(&fbatch);
+
+ for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
@@ -658,7 +647,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
if (xa_has_values)
- clear_shadow_entries(mapping, &fbatch, indices);
+ clear_shadow_entries(mapping, indices[0], indices[nr-1]);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
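The truncate.c rework above replaces the per-index __clear_shadow_entry() lookups with a single xas_for_each() walk over the affected range while i_pages is held locked. A rough userspace sketch of that change in shape only: a plain array stands in for the XArray, a set low bit plays the role of xa_is_value(), and no locking or node shrinking is modelled:

#include <stdint.h>
#include <stdio.h>

#define NSLOTS 16
static uintptr_t slots[NSLOTS];			/* stand-in for mapping->i_pages */

static int is_value(uintptr_t e) { return e & 1; }	/* plays xa_is_value() */

/* Old shape: revalidate and clear one remembered index at a time,
 * as __clear_shadow_entry() used to do. */
static void clear_one(unsigned long index, uintptr_t expected)
{
	if (slots[index] == expected)
		slots[index] = 0;
}

/* New shape of clear_shadow_entries(): walk [start, max] once and clear
 * every value entry found, instead of one lookup per remembered index. */
static void clear_range(unsigned long start, unsigned long max)
{
	for (unsigned long i = start; i <= max && i < NSLOTS; i++)
		if (is_value(slots[i]))
			slots[i] = 0;
}

int main(void)
{
	slots[2] = 0x21;	/* "shadow" entries: low bit set */
	slots[5] = 0x51;
	slots[3] = 0x40;	/* a "folio": left untouched */

	clear_one(2, 0x21);
	clear_range(0, NSLOTS - 1);

	for (unsigned long i = 0; i < NSLOTS; i++)
		if (slots[i])
			printf("slot %lu -> %#lx\n", i, (unsigned long)slots[i]);
	return 0;
}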
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ce13c4062647..60a0be33766f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
if (!*foliop) {
ret = -ENOMEM;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
- dst_addr, false);
+ dst_addr);
if (!folio)
goto out;
@@ -1135,7 +1135,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
-
+ pmd_t dummy_pmdval;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
@@ -1146,7 +1146,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
retry:
- dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+ /*
+ * Use the maywrite version to indicate that dst_pte will be modified,
+ * but since we will use pte_same() to detect the change of the pte
+ * entry, there is no need to get pmdval, so just pass a dummy variable
+ * to it.
+ */
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval,
+ &dst_ptl);
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
@@ -1154,7 +1161,9 @@ retry:
goto out;
}
- src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+ /* same as dst_pte */
+ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
+ &src_ptl);
/*
* We held the mmap_lock for reading so MADV_DONTNEED
diff --git a/mm/util.c b/mm/util.c
index 4f1275023eb7..60017d2a9e48 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -820,7 +820,7 @@ void *vcalloc_noprof(size_t n, size_t size)
}
EXPORT_SYMBOL(vcalloc_noprof);
-struct anon_vma *folio_anon_vma(struct folio *folio)
+struct anon_vma *folio_anon_vma(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
diff --git a/mm/vma.c b/mm/vma.c
index 7621384d64cf..8a454a7bbc80 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -7,6 +7,57 @@
#include "vma_internal.h"
#include "vma.h"
+struct mmap_state {
+ struct mm_struct *mm;
+ struct vma_iterator *vmi;
+
+ unsigned long addr;
+ unsigned long end;
+ pgoff_t pgoff;
+ unsigned long pglen;
+ unsigned long flags;
+ struct file *file;
+
+ unsigned long charged;
+ bool retry_merge;
+
+ struct vm_area_struct *prev;
+ struct vm_area_struct *next;
+
+ /* Unmapping state. */
+ struct vma_munmap_struct vms;
+ struct ma_state mas_detach;
+ struct maple_tree mt_detach;
+};
+
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+ struct mmap_state name = { \
+ .mm = mm_, \
+ .vmi = vmi_, \
+ .addr = addr_, \
+ .end = (addr_) + (len_), \
+ .pgoff = pgoff_, \
+ .pglen = PHYS_PFN(len_), \
+ .flags = flags_, \
+ .file = file_, \
+ }
+
+#define VMG_MMAP_STATE(name, map_, vma_) \
+ struct vma_merge_struct name = { \
+ .mm = (map_)->mm, \
+ .vmi = (map_)->vmi, \
+ .start = (map_)->addr, \
+ .end = (map_)->end, \
+ .flags = (map_)->flags, \
+ .pgoff = (map_)->pgoff, \
+ .file = (map_)->file, \
+ .prev = (map_)->prev, \
+ .vma = vma_, \
+ .next = (vma_) ? NULL : (map_)->next, \
+ .state = VMA_MERGE_START, \
+ .merge_flags = VMG_FLAG_DEFAULT, \
+ }
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -564,7 +615,11 @@ void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
#endif
- i++;
+ /* Check for an infinite loop */
+ if (++i > mm->map_count + 10) {
+ i = -1;
+ break;
+ }
}
if (i != mm->map_count) {
pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
@@ -911,10 +966,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
{
struct vm_area_struct *prev = vmg->prev;
struct vm_area_struct *next = vmg->next;
- unsigned long start = vmg->start;
unsigned long end = vmg->end;
- pgoff_t pgoff = vmg->pgoff;
- pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right;
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
@@ -936,7 +988,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right) {
vmg->end = next->vm_end;
vmg->vma = next;
- vmg->pgoff = next->vm_pgoff - pglen;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
@@ -970,16 +1021,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return vmg->vma;
}
- /* If expansion failed, reset state. Allows us to retry merge later. */
- if (!just_expand) {
- vmg->vma = NULL;
- vmg->start = start;
- vmg->end = end;
- vmg->pgoff = pgoff;
- if (vmg->vma == prev)
- vma_iter_set(vmg->vmi, start);
- }
-
return NULL;
}
@@ -1103,7 +1144,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
vms->clear_ptes = false;
}
-void vms_clean_up_area(struct vma_munmap_struct *vms,
+static void vms_clean_up_area(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *vma;
@@ -1126,7 +1167,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms,
* used for the munmap() and may downgrade the lock - if requested. Everything
* needed to be done once the vma maple tree is updated.
*/
-void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
+static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *vma;
@@ -1168,6 +1209,23 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
}
/*
+ * reattach_vmas() - Undo any munmap work and free resources
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas and free up the maple tree used to track the vmas.
+ */
+static void reattach_vmas(struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ vma_mark_detached(vma, false);
+
+ __mt_destroy(mas_detach->tree);
+}
+
+/*
* vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
* for removal at a later date. Handles splitting first and last if necessary
* and marking the vmas as isolated.
@@ -1177,7 +1235,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
*
* Return: 0 on success, error otherwise
*/
-int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
+static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *next = NULL;
@@ -1254,7 +1312,7 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
else if (is_data_mapping(next->vm_flags))
vms->data_vm += nrpages;
- if (unlikely(vms->uf)) {
+ if (vms->uf) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
* will remain split, but userland will get a
@@ -1316,6 +1374,39 @@ map_count_exceeded:
}
/*
+ * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
+ * @vms: The vma munmap struct
+ * @vmi: The vma iterator
+ * @vma: The first vm_area_struct to munmap
+ * @start: The aligned start address to munmap
+ * @end: The aligned end address to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: Unlock after the operation. Only unlocked on success
+ */
+static void init_vma_munmap(struct vma_munmap_struct *vms,
+ struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
+{
+ vms->vmi = vmi;
+ vms->vma = vma;
+ if (vma) {
+ vms->start = start;
+ vms->end = end;
+ } else {
+ vms->start = vms->end = 0;
+ }
+ vms->unlock = unlock;
+ vms->uf = uf;
+ vms->vma_count = 0;
+ vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
+ vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
+ vms->unmap_start = FIRST_USER_ADDRESS;
+ vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false;
+}
+
+/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
* @vma: The starting vm_area_struct
@@ -2069,3 +2160,321 @@ void mm_drop_all_locks(struct mm_struct *mm)
mutex_unlock(&mm_all_locks_mutex);
}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
+{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return false;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+/*
+ * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
+ * operation.
+ * @vms: The vma unmap structure
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas, free up the maple tree used to track the vmas.
+ * If that's not possible because the ptes are cleared (and vm_ops->close() may
+ * have been called), then a NULL is written over the vmas and the vmas are
+ * removed (munmap() completed).
+ */
+static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+ struct ma_state *mas = &vms->vmi->mas;
+
+ if (!vms->nr_pages)
+ return;
+
+ if (vms->clear_ptes)
+ return reattach_vmas(mas_detach);
+
+ /*
+ * Aborting cannot just call the vm_ops open() because they are often
+ * not symmetrical and state data has been lost. Resort to the old
+ * failure method of leaving a gap where the MAP_FIXED mapping failed.
+ */
+ mas_set_range(mas, vms->start, vms->end - 1);
+ mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+ /* Clean up the insertion of the unfortunate gap */
+ vms_complete_munmap_vmas(vms, mas_detach);
+}
+
+/*
+ * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
+ * unmapped once the map operation is completed, check limits, account mapping
+ * and clean up any pre-existing VMAs.
+ *
+ * @map: Mapping state.
+ * @uf: Userfaultfd context list.
+ *
+ * Returns: 0 on success, error code otherwise.
+ */
+static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
+{
+ int error;
+ struct vma_iterator *vmi = map->vmi;
+ struct vma_munmap_struct *vms = &map->vms;
+
+ /* Find the first overlapping VMA and initialise unmap state. */
+ vms->vma = vma_find(vmi, map->end);
+ init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
+ /* unlock = */ false);
+
+ /* OK, we have overlapping VMAs - prepare to unmap them. */
+ if (vms->vma) {
+ mt_init_flags(&map->mt_detach,
+ vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(map->mt_detach);
+ mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
+ /* Prepare to unmap any existing mapping in the area */
+ error = vms_gather_munmap_vmas(vms, &map->mas_detach);
+ if (error) {
+ /* On error VMAs will already have been reattached. */
+ vms->nr_pages = 0;
+ return error;
+ }
+
+ map->next = vms->next;
+ map->prev = vms->prev;
+ } else {
+ map->next = vma_iter_next_rewind(vmi, &map->prev);
+ }
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ return -ENOMEM;
+
+ /* Private writable mapping: check memory availability. */
+ if (accountable_mapping(map->file, map->flags)) {
+ map->charged = map->pglen;
+ map->charged -= vms->nr_accounted;
+ if (map->charged) {
+ error = security_vm_enough_memory_mm(map->mm, map->charged);
+ if (error)
+ return error;
+ }
+
+ vms->nr_accounted = 0;
+ map->flags |= VM_ACCOUNT;
+ }
+
+ /*
+ * Clear PTEs while the vma is still in the tree so that rmap
+ * cannot race with the freeing later in the truncate scenario.
+ * This is also needed for mmap_file(), which is why vm_ops
+ * close function is called.
+ */
+ vms_clean_up_area(vms, &map->mas_detach);
+
+ return 0;
+}
+
+
+static int __mmap_new_file_vma(struct mmap_state *map,
+ struct vm_area_struct *vma)
+{
+ struct vma_iterator *vmi = map->vmi;
+ int error;
+
+ vma->vm_file = get_file(map->file);
+ error = mmap_file(vma->vm_file, vma);
+ if (error) {
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ vma_iter_set(vmi, vma->vm_end);
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(&vmi->mas, vma, map->prev, map->next);
+
+ return error;
+ }
+
+ /* Drivers cannot alter the address of the VMA. */
+ WARN_ON_ONCE(map->addr != vma->vm_start);
+ /*
+ * Drivers should not permit writability when previously it was
+ * disallowed.
+ */
+ VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
+ !(map->flags & VM_MAYWRITE) &&
+ (vma->vm_flags & VM_MAYWRITE));
+
+ /* If the flags change (and are mergeable), let's retry later. */
+ map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
+ map->flags = vma->vm_flags;
+
+ return 0;
+}
+
+/*
+ * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
+ * possible.
+ *
+ * @map: Mapping state.
+ * @vmap: Output pointer for the new VMA.
+ *
+ * Returns: Zero on success, or an error.
+ */
+static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
+{
+ struct vma_iterator *vmi = map->vmi;
+ int error = 0;
+ struct vm_area_struct *vma;
+
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. The address has already been validated but
+ * not unmapped; any overlapping maps are removed from the list.
+ */
+ vma = vm_area_alloc(map->mm);
+ if (!vma)
+ return -ENOMEM;
+
+ vma_iter_config(vmi, map->addr, map->end);
+ vma_set_range(vma, map->addr, map->end, map->pgoff);
+ vm_flags_init(vma, map->flags);
+ vma->vm_page_prot = vm_get_page_prot(map->flags);
+
+ if (vma_iter_prealloc(vmi, vma)) {
+ error = -ENOMEM;
+ goto free_vma;
+ }
+
+ if (map->file)
+ error = __mmap_new_file_vma(map, vma);
+ else if (map->flags & VM_SHARED)
+ error = shmem_zero_setup(vma);
+ else
+ vma_set_anonymous(vma);
+
+ if (error)
+ goto free_iter_vma;
+
+#ifdef CONFIG_SPARC64
+ /* TODO: Fix SPARC ADI! */
+ WARN_ON_ONCE(!arch_validate_flags(map->flags));
+#endif
+
+ /* Lock the VMA since it is modified after insertion into VMA tree */
+ vma_start_write(vma);
+ vma_iter_store(vmi, vma);
+ map->mm->map_count++;
+ vma_link_file(vma);
+
+ /*
+ * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, map->flags);
+ ksm_add_vma(vma);
+ *vmap = vma;
+ return 0;
+
+free_iter_vma:
+ vma_iter_free(vmi);
+free_vma:
+ vm_area_free(vma);
+ return error;
+}
+
+/*
+ * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
+ * statistics, handle locking and finalise the VMA.
+ *
+ * @map: Mapping state.
+ * @vma: Merged or newly allocated VMA for the mmap()'d region.
+ */
+static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = map->mm;
+ unsigned long vm_flags = vma->vm_flags;
+
+ perf_event_mmap(vma);
+
+ /* Unmap any existing mapping in the area. */
+ vms_complete_munmap_vmas(&map->vms, &map->mas_detach);
+
+ vm_stat_account(mm, vma->vm_flags, map->pglen);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(mm))
+ vm_flags_clear(vma, VM_LOCKED_MASK);
+ else
+ mm->locked_vm += map->pglen;
+ }
+
+ if (vma->vm_file)
+ uprobe_mmap(vma);
+
+ /*
+ * A new (or expanded) vma always gets soft-dirty status.
+ * Otherwise the user-space soft-dirty page tracker won't
+ * be able to distinguish the case where a vma area was unmapped
+ * and then a new one mapped in place (which must be treated as
+ * a completely new data area).
+ */
+ vm_flags_set(vma, VM_SOFTDIRTY);
+
+ vma_set_page_prot(vma);
+}
+
+unsigned long __mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ int error;
+ VMA_ITERATOR(vmi, mm, addr);
+ MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+
+ error = __mmap_prepare(&map, uf);
+ if (error)
+ goto abort_munmap;
+
+ /* Attempt to merge with adjacent VMAs... */
+ if (map.prev || map.next) {
+ VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
+
+ vma = vma_merge_new_range(&vmg);
+ }
+
+ /* ...but if we can't, allocate a new VMA. */
+ if (!vma) {
+ error = __mmap_new_vma(&map, &vma);
+ if (error)
+ goto unacct_error;
+ }
+
+ /* If flags changed, we might be able to merge, so try again. */
+ if (map.retry_merge) {
+ VMG_MMAP_STATE(vmg, &map, vma);
+
+ vma_iter_config(map.vmi, map.addr, map.end);
+ vma_merge_existing_range(&vmg);
+ }
+
+ __mmap_complete(&map, vma);
+
+ return addr;
+
+ /* Accounting was done by __mmap_prepare(). */
+unacct_error:
+ if (map.charged)
+ vm_unacct_memory(map.charged);
+abort_munmap:
+ vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
+ return error;
+}
diff --git a/mm/vma.h b/mm/vma.h
index d58068c0ff2e..388d34748674 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -165,99 +165,6 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
-#ifdef CONFIG_MMU
-/*
- * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
- * @vms: The vma munmap struct
- * @vmi: The vma iterator
- * @vma: The first vm_area_struct to munmap
- * @start: The aligned start address to munmap
- * @end: The aligned end address to munmap
- * @uf: The userfaultfd list_head
- * @unlock: Unlock after the operation. Only unlocked on success
- */
-static inline void init_vma_munmap(struct vma_munmap_struct *vms,
- struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, struct list_head *uf,
- bool unlock)
-{
- vms->vmi = vmi;
- vms->vma = vma;
- if (vma) {
- vms->start = start;
- vms->end = end;
- } else {
- vms->start = vms->end = 0;
- }
- vms->unlock = unlock;
- vms->uf = uf;
- vms->vma_count = 0;
- vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
- vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
- vms->unmap_start = FIRST_USER_ADDRESS;
- vms->unmap_end = USER_PGTABLES_CEILING;
- vms->clear_ptes = false;
-}
-#endif
-
-int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-void vms_clean_up_area(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-/*
- * reattach_vmas() - Undo any munmap work and free resources
- * @mas_detach: The maple state with the detached maple tree
- *
- * Reattach any detached vmas and free up the maple tree used to track the vmas.
- */
-static inline void reattach_vmas(struct ma_state *mas_detach)
-{
- struct vm_area_struct *vma;
-
- mas_set(mas_detach, 0);
- mas_for_each(mas_detach, vma, ULONG_MAX)
- vma_mark_detached(vma, false);
-
- __mt_destroy(mas_detach->tree);
-}
-
-/*
- * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
- * operation.
- * @vms: The vma unmap structure
- * @mas_detach: The maple state with the detached maple tree
- *
- * Reattach any detached vmas, free up the maple tree used to track the vmas.
- * If that's not possible because the ptes are cleared (and vm_ops->closed() may
- * have been called), then a NULL is written over the vmas and the vmas are
- * removed (munmap() completed).
- */
-static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach)
-{
- struct ma_state *mas = &vms->vmi->mas;
- if (!vms->nr_pages)
- return;
-
- if (vms->clear_ptes)
- return reattach_vmas(mas_detach);
-
- /*
- * Aborting cannot just call the vm_ops open() because they are often
- * not symmetrical and state data has been lost. Resort to the old
- * failure method of leaving a gap where the MAP_FIXED mapping failed.
- */
- mas_set_range(mas, vms->start, vms->end - 1);
- mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
- /* Clean up the insertion of the unfortunate gap */
- vms_complete_munmap_vmas(vms, mas_detach);
-}
-
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -336,6 +243,10 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
+unsigned long __mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf);
+
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index b930ab12a587..fc5f172a36bd 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -17,8 +17,10 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/huge_mm.h>
+#include <linux/hugetlb.h>
#include <linux/hugetlb_inline.h>
#include <linux/kernel.h>
+#include <linux/ksm.h>
#include <linux/khugepaged.h>
#include <linux/list.h>
#include <linux/maple_tree.h>
@@ -32,11 +34,14 @@
#include <linux/mmu_context.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
+#include <linux/perf_event.h>
#include <linux/pfn.h>
#include <linux/rcupdate.h>
#include <linux/rmap.h>
#include <linux/rwsem.h>
#include <linux/sched/signal.h>
+#include <linux/security.h>
+#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uprobes.h>
#include <linux/userfaultfd_k.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 634162271c00..7ed39d104201 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
* RETURNS:
* 0 on success, -errno on failure.
*/
-static int vmap_pages_range(unsigned long addr, unsigned long end,
+int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
int err;
@@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
reclaim_list_global(&decay_list);
}
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+ struct vmap_area *va;
+ unsigned long start, end;
+
+ start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+ end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+ list_for_each_entry(va, &vn->purge_list, list) {
+ if (is_vmalloc_or_module_addr((void *) va->va_start))
+ kasan_release_vmalloc(va->va_start, va->va_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE);
+ }
+
+ kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
static void purge_vmap_node(struct work_struct *work)
{
struct vmap_node *vn = container_of(work,
@@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work)
struct vmap_area *va, *n_va;
LIST_HEAD(local_list);
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_release_vmalloc_node(vn);
+
vn->nr_purged = 0;
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
unsigned long nr = va_size(va) >> PAGE_SHIFT;
- unsigned long orig_start = va->va_start;
- unsigned long orig_end = va->va_end;
unsigned int vn_id = decode_vn_id(va->flags);
list_del_init(&va->list);
- if (is_vmalloc_or_module_addr((void *)orig_start))
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
-
nr_purged_pages += nr;
vn->nr_purged++;
@@ -3007,6 +3023,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
#endif
}
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+ return vm_area_page_order(vm);
+}
+
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
@@ -3085,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
vm->flags &= ~VM_UNINITIALIZED;
}
-static struct vm_struct *__get_vm_area_node(unsigned long size,
+struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long flags,
unsigned long start, unsigned long end, int node,
gfp_t gfp_mask, const void *caller)
@@ -3763,8 +3784,6 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
}
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
- unsigned long size_per_node;
-
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
@@ -3772,13 +3791,10 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
* supporting them.
*/
- size_per_node = size;
- if (node == NUMA_NO_NODE)
- size_per_node /= num_online_nodes();
- if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
shift = PMD_SHIFT;
else
- shift = arch_vmap_pte_supported_shift(size_per_node);
+ shift = arch_vmap_pte_supported_shift(size);
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
@@ -4784,7 +4800,8 @@ recovery:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
}
@@ -4834,7 +4851,8 @@ err_free_shadow:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
kfree(vms[area]);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28ba2b06fc7d..76378bc257e3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1258,8 +1258,8 @@ retry:
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
- count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
#endif
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
if (!add_to_swap(folio))
goto activate_locked_split;
}
@@ -2129,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
- struct reclaim_stat dummy_stat;
+ struct reclaim_stat stat;
unsigned int nr_reclaimed;
struct folio *folio;
struct scan_control sc = {
@@ -2140,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
+ trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);
return nr_reclaimed;
}
@@ -2602,8 +2603,6 @@ static bool should_clear_pmd_young(void)
* shorthand helpers
******************************************************************************/
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define DEFINE_MAX_SEQ(lruvec) \
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
@@ -3138,7 +3137,6 @@ static int folio_update_gen(struct folio *folio, int gen)
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- VM_WARN_ON_ONCE(!rcu_read_lock_held());
do {
/* lru_gen_del_folio() has isolated this page? */
@@ -3354,7 +3352,7 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
if (folio_nid(folio) != pgdat->node_id)
return NULL;
- if (folio_memcg_rcu(folio) != memcg)
+ if (folio_memcg(folio) != memcg)
return NULL;
/* file VMAs can contain anon pages from COW */
@@ -3386,8 +3384,10 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ pmd_t pmdval;
- pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
+ &ptl);
if (!pte)
return false;
if (!spin_trylock(ptl)) {
@@ -3395,6 +3395,11 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
return false;
}
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
arch_enter_lazy_mmu_mode();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
@@ -3643,10 +3648,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
.p4d_entry = walk_pud_range,
.walk_lock = PGWALK_RDLOCK,
};
-
int err;
struct lruvec *lruvec = walk->lruvec;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3659,10 +3662,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
if (walk->seq != max_seq)
break;
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- break;
-
/* the caller might be holding the lock for write */
if (mmap_read_trylock(mm)) {
err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
@@ -3670,8 +3669,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
mmap_read_unlock(mm);
}
- mem_cgroup_unlock_pages();
-
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(walk);
@@ -4093,10 +4090,6 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
}
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- return true;
-
arch_enter_lazy_mmu_mode();
pte -= (addr - start) / PAGE_SIZE;
@@ -4134,12 +4127,13 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
old_gen = folio_lru_gen(folio);
if (old_gen < 0)
folio_set_referenced(folio);
- else if (old_gen != new_gen)
+ else if (old_gen != new_gen) {
+ folio_clear_lru_refs(folio);
folio_activate(folio);
+ }
}
arch_leave_lazy_mmu_mode();
- mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
@@ -4290,6 +4284,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int tier_idx)
{
bool success;
+ bool dirty, writeback;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -4335,9 +4330,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true;
}
+ dirty = folio_test_dirty(folio);
+ writeback = folio_test_writeback(folio);
+ if (type == LRU_GEN_FILE && dirty) {
+ sc->nr.file_taken += delta;
+ if (!writeback)
+ sc->nr.unqueued_dirty += delta;
+ }
+
/* waiting for writeback */
- if (folio_test_locked(folio) || folio_test_writeback(folio) ||
- (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ if (folio_test_locked(folio) || writeback ||
+ (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4368,7 +4371,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on MAX_NR_TIERS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ folio_clear_lru_refs(folio);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4453,7 +4456,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
-
+ if (type == LRU_GEN_FILE)
+ sc->nr.file_taken += isolated;
/*
* There might not be eligible folios due to reclaim_idx. Check the
* remaining to prevent livelock if it's not making progress.
@@ -4587,6 +4591,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
return scanned;
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
@@ -4795,6 +4800,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
}
+ /*
+ * If too much of the file cache in the coldest generation can't be
+ * evicted due to being dirty, wake up the flusher.
+ */
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -5940,6 +5952,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
bool reclaimable = false;
if (lru_gen_enabled() && root_reclaim(sc)) {
+ memset(&sc->nr, 0, sizeof(sc->nr));
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -5990,7 +6003,8 @@ again:
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ if (sc->nr.unqueued_dirty &&
+ sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
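The final vmscan.c hunk adds an "sc->nr.unqueued_dirty &&" guard so the "all taken file folios are dirty" test no longer fires when both counters are still zero. A tiny sketch of the difference (counter values invented):

#include <stdbool.h>
#include <stdio.h>

static bool set_dirty_old(unsigned long unqueued_dirty, unsigned long file_taken)
{
	return unqueued_dirty == file_taken;	/* 0 == 0 fires spuriously */
}

static bool set_dirty_new(unsigned long unqueued_dirty, unsigned long file_taken)
{
	return unqueued_dirty && unqueued_dirty == file_taken;
}

int main(void)
{
	printf("old, nothing scanned: %d\n", set_dirty_old(0, 0));	/* 1 */
	printf("new, nothing scanned: %d\n", set_dirty_new(0, 0));	/* 0 */
	printf("new, all taken dirty: %d\n", set_dirty_new(32, 32));	/* 1 */
	return 0;
}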
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ac6a5aa34eab..4d016314a56c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1273,6 +1273,9 @@ const char * const vmstat_text[] = {
"pgdemote_kswapd",
"pgdemote_direct",
"pgdemote_khugepaged",
+#ifdef CONFIG_HUGETLB_PAGE
+ "nr_hugetlb",
+#endif
/* system-wide enum vm_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -1780,6 +1783,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
#ifdef CONFIG_NUMA
+ fold_vm_zone_numa_events(zone);
for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
zone_numa_event_state(zone, i));
@@ -1793,13 +1797,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
seq_printf(m,
"\n cpu: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i"
+ "\n high_min: %i"
+ "\n high_max: %i",
i,
pcp->count,
pcp->high,
- pcp->batch);
+ pcp->batch,
+ pcp->high_min,
+ pcp->high_max);
#ifdef CONFIG_SMP
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
seq_printf(m, "\n vm stats threshold: %d",
@@ -1931,6 +1939,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
@@ -2133,7 +2142,8 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
- refresh_zone_stat_thresholds();
+ if (vmstat_late_init_done)
+ refresh_zone_stat_thresholds();
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
@@ -2165,6 +2175,14 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
}
+static int __init vmstat_late_init(void)
+{
+ refresh_zone_stat_thresholds();
+ vmstat_late_init_done = 1;
+
+ return 0;
+}
+late_initcall(vmstat_late_init);
#endif
struct workqueue_struct *mm_percpu_wq;
diff --git a/mm/workingset.c b/mm/workingset.c
index a2b28e356e68..a4705e196545 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -591,22 +591,12 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
void workingset_activation(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = folio_memcg_rcu(folio);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
- rcu_read_unlock();
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}
/*
@@ -712,8 +702,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
- void *arg) __must_hold(lru_lock)
+ void *arg) __must_hold(lru->lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
struct address_space *mapping;
@@ -722,20 +711,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
- * lru_lock. Because the page cache tree is emptied before
- * the inode can be destroyed, holding the lru_lock pins any
+ * &lru->lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the &lru->lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
- * to reclaim, take the node off-LRU, and drop the lru_lock.
+ * to reclaim, take the node off-LRU, and drop the &lru->lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
@@ -744,7 +733,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (mapping->host != NULL) {
if (!spin_trylock(&mapping->host->i_lock)) {
xa_unlock(&mapping->i_pages);
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
@@ -753,7 +742,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
list_lru_isolate(lru, item);
__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
/*
* The nodes should only contain one or more shadow entries,
@@ -777,7 +766,6 @@ out_invalid:
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
- spin_lock_irq(lru_lock);
return ret;
}
@@ -823,8 +811,8 @@ static int __init workingset_init(void)
if (!workingset_shadow_shrinker)
goto err;
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- workingset_shadow_shrinker);
+ ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+ &shadow_nodes_key);
if (ret)
goto err_list_lru;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 16a07def09c9..64b66a4d3e6e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
/*
* zsmalloc memory allocator
*
@@ -261,7 +263,7 @@ struct zspage {
struct mapping_area {
local_lock_t lock;
char *vm_buf; /* copy buffer for objects that span pages */
- char *vm_addr; /* address of kmap_atomic()'ed pages */
+ char *vm_addr; /* address of kmap_local_page()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
@@ -898,7 +900,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
set_first_obj_offset(page, off);
- vaddr = kmap_atomic(page);
+ vaddr = kmap_local_page(page);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
@@ -921,7 +923,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
*/
link->next = -1UL << OBJ_TAG_BITS;
}
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -1044,11 +1046,10 @@ static inline void __zs_cpu_down(struct mapping_area *area)
static void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- int sizes[2];
- void *addr;
+ size_t sizes[2];
char *buf = area->vm_buf;
- /* disable page faults to match kmap_atomic() return conditions */
+ /* disable page faults to match kmap_local_page() return conditions */
pagefault_disable();
/* no read fastpath */
@@ -1059,12 +1060,8 @@ static void *__zs_map_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy object to per-cpu buffer */
- addr = kmap_atomic(pages[0]);
- memcpy(buf, addr + off, sizes[0]);
- kunmap_atomic(addr);
- addr = kmap_atomic(pages[1]);
- memcpy(buf + sizes[0], addr, sizes[1]);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, pages[0], off, sizes[0]);
+ memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);
out:
return area->vm_buf;
}
@@ -1072,8 +1069,7 @@ out:
static void __zs_unmap_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- int sizes[2];
- void *addr;
+ size_t sizes[2];
char *buf;
/* no write fastpath */
@@ -1089,15 +1085,11 @@ static void __zs_unmap_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy per-cpu buffer to object */
- addr = kmap_atomic(pages[0]);
- memcpy(addr + off, buf, sizes[0]);
- kunmap_atomic(addr);
- addr = kmap_atomic(pages[1]);
- memcpy(addr, buf + sizes[0], sizes[1]);
- kunmap_atomic(addr);
+ memcpy_to_page(pages[0], off, buf, sizes[0]);
+ memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);
out:
- /* enable page faults to match kunmap_atomic() return conditions */
+ /* enable page faults to match kunmap_local() return conditions */
pagefault_enable();
}
@@ -1223,7 +1215,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_atomic(page);
+ area->vm_addr = kmap_local_page(page);
ret = area->vm_addr + off;
goto out;
}
@@ -1260,7 +1252,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
- kunmap_atomic(area->vm_addr);
+ kunmap_local(area->vm_addr);
else {
struct page *pages[2];
@@ -1318,7 +1310,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
for (i = 0; i < nr_page; i++)
m_page = get_next_page(m_page);
- vaddr = kmap_atomic(m_page);
+ vaddr = kmap_local_page(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
@@ -1328,7 +1320,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
/* record handle to page->index */
zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
mod_zspage_inuse(zspage, 1);
obj = location_to_obj(m_page, obj);
@@ -1419,7 +1411,7 @@ static void obj_free(int class_size, unsigned long obj)
f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
- vaddr = kmap_atomic(f_page);
+ vaddr = kmap_local_page(f_page);
link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
@@ -1429,7 +1421,7 @@ static void obj_free(int class_size, unsigned long obj)
f_page->index = 0;
set_freeobj(zspage, f_objidx);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
mod_zspage_inuse(zspage, -1);
}
@@ -1492,8 +1484,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (d_off + class->size > PAGE_SIZE)
d_size = PAGE_SIZE - d_off;
- s_addr = kmap_atomic(s_page);
- d_addr = kmap_atomic(d_page);
+ s_addr = kmap_local_page(s_page);
+ d_addr = kmap_local_page(d_page);
while (1) {
size = min(s_size, d_size);
@@ -1509,33 +1501,33 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_size -= size;
/*
- * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
- * calls must occurs in reverse order of calls to kmap_atomic().
- * So, to call kunmap_atomic(s_addr) we should first call
- * kunmap_atomic(d_addr). For more details see
+ * Calling kunmap_local(d_addr) is necessary. kunmap_local()
+ * calls must occurs in reverse order of calls to kmap_local_page().
+ * So, to call kunmap_local(s_addr) we should first call
+ * kunmap_local(d_addr). For more details see
* Documentation/mm/highmem.rst.
*/
if (s_off >= PAGE_SIZE) {
- kunmap_atomic(d_addr);
- kunmap_atomic(s_addr);
+ kunmap_local(d_addr);
+ kunmap_local(s_addr);
s_page = get_next_page(s_page);
- s_addr = kmap_atomic(s_page);
- d_addr = kmap_atomic(d_page);
+ s_addr = kmap_local_page(s_page);
+ d_addr = kmap_local_page(d_page);
s_size = class->size - written;
s_off = 0;
}
if (d_off >= PAGE_SIZE) {
- kunmap_atomic(d_addr);
+ kunmap_local(d_addr);
d_page = get_next_page(d_page);
- d_addr = kmap_atomic(d_page);
+ d_addr = kmap_local_page(d_page);
d_size = class->size - written;
d_off = 0;
}
}
- kunmap_atomic(d_addr);
- kunmap_atomic(s_addr);
+ kunmap_local(d_addr);
+ kunmap_local(s_addr);
}
/*
@@ -1548,7 +1540,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
- void *addr = kmap_atomic(page);
+ void *addr = kmap_local_page(page);
offset = get_first_obj_offset(page);
offset += class->size * index;
@@ -1561,7 +1553,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
index++;
}
- kunmap_atomic(addr);
+ kunmap_local(addr);
*obj_idx = index;
@@ -1798,14 +1790,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
migrate_write_lock(zspage);
offset = get_first_obj_offset(page);
- s_addr = kmap_atomic(page);
+ s_addr = kmap_local_page(page);
/*
* Here, any user cannot access all objects in the zspage so let's move.
*/
- d_addr = kmap_atomic(newpage);
+ d_addr = kmap_local_page(newpage);
copy_page(d_addr, s_addr);
- kunmap_atomic(d_addr);
+ kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
@@ -1818,7 +1810,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
record_obj(handle, new_obj);
}
}
- kunmap_atomic(s_addr);
+ kunmap_local(s_addr);
replace_sub_page(class, zspage, newpage, page);
/*
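The zsmalloc conversion above swaps the kmap_atomic()/memcpy()/kunmap_atomic() pairs for memcpy_from_page()/memcpy_to_page() while keeping the same split of an object that straddles a page boundary. A small userspace sketch of that split; the two helpers are re-implemented here on plain buffers, since the kernel versions map the page internally:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Userspace stand-ins: a "page" is just a PAGE_SIZE buffer here, so no
 * kmap_local_page() step is needed. */
static void memcpy_from_page(char *to, const char *page, size_t off, size_t len)
{
	memcpy(to, page + off, len);
}

static void memcpy_to_page(char *page, size_t off, const char *from, size_t len)
{
	memcpy(page + off, from, len);
}

int main(void)
{
	static char pages[2][PAGE_SIZE];
	char buf[128];
	size_t off = PAGE_SIZE - 40;	/* object starts 40 bytes before the page end */
	size_t size = 100;		/* ...and spills onto the next page */
	size_t sizes[2];

	/* Same split as __zs_map_object(): up to the boundary, then the rest. */
	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	memcpy_from_page(buf, pages[0], off, sizes[0]);
	memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);

	/* And back, as in __zs_unmap_object(). */
	memcpy_to_page(pages[0], off, buf, sizes[0]);
	memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);

	printf("copied %zu + %zu bytes across the page boundary\n", sizes[0], sizes[1]);
	return 0;
}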
diff --git a/mm/zswap.c b/mm/zswap.c
index 0030ce8fecfc..f6316b66fb23 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -402,7 +402,7 @@ static void __zswap_pool_empty(struct percpu_ref *ref)
spin_unlock_bh(&zswap_pools_lock);
}
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
+static int __must_check zswap_pool_tryget(struct zswap_pool *pool)
{
if (!pool)
return 0;
@@ -410,6 +410,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
return percpu_ref_tryget(&pool->ref);
}
+/* The caller must already have a reference. */
+static void zswap_pool_get(struct zswap_pool *pool)
+{
+ percpu_ref_get(&pool->ref);
+}
+
static void zswap_pool_put(struct zswap_pool *pool)
{
percpu_ref_put(&pool->ref);
@@ -440,7 +446,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
rcu_read_lock();
pool = __zswap_pool_current();
- if (!zswap_pool_get(pool))
+ if (!zswap_pool_tryget(pool))
pool = NULL;
rcu_read_unlock();
@@ -461,7 +467,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
if (strcmp(zpool_get_type(pool->zpool), type))
continue;
/* if we can't get it, it's about to be destroyed */
- if (!zswap_pool_get(pool))
+ if (!zswap_pool_tryget(pool))
continue;
return pool;
}
@@ -703,12 +709,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
- * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
- * used in list_lru lookup, only two scenarios are possible:
+ * concurrent memcg offlining:
*
- * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * 1. list_lru_add() is called before list_lru_one is dead. The
* new entry will be reparented to memcg's parent's list_lru.
- * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * 2. list_lru_add() is called after list_lru_one is dead. The
* new entry will be added directly to memcg's parent's list_lru.
*
* Similar reasoning holds for list_lru_del().
@@ -802,7 +807,7 @@ static void zswap_entry_free(struct zswap_entry *entry)
obj_cgroup_put(entry->objcg);
}
zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
+ atomic_long_dec(&zswap_stored_pages);
}
/*********************************
@@ -875,7 +880,8 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
return 0;
}
-static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+static bool zswap_compress(struct page *page, struct zswap_entry *entry,
+ struct zswap_pool *pool)
{
struct crypto_acomp_ctx *acomp_ctx;
struct scatterlist input, output;
@@ -887,13 +893,13 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
- sg_set_folio(&input, folio, PAGE_SIZE, 0);
+ sg_set_page(&input, page, PAGE_SIZE, 0);
/*
* We need PAGE_SIZE * 2 here since there maybe over-compression case,
@@ -920,7 +926,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
if (comp_ret)
goto unlock;
- zpool = entry->pool->zpool;
+ zpool = pool->zpool;
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
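
zswap_compress() now receives a single struct page (one subpage of a possibly large folio) plus an explicit pool, so its scatterlist is built with sg_set_page() rather than sg_set_folio(). A small illustrative sketch of the one-entry scatterlist setup (the helper name is made up):

#include <linux/scatterlist.h>

static void example_sg_for_one_page(struct scatterlist *sg, struct page *page)
{
	/* One entry covering exactly one page, starting at offset 0. */
	sg_init_table(sg, 1);
	sg_set_page(sg, page, PAGE_SIZE, 0);
}
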
@@ -1096,7 +1102,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* for reclaim by this ratio.
*/
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg)
+ void *arg)
{
struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
bool *encountered_page_in_swapcache = (bool *)arg;
@@ -1152,7 +1158,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* It's safe to drop the lock here because we return either
* LRU_REMOVED_RETRY or LRU_RETRY.
*/
- spin_unlock(lock);
+ spin_unlock(&l->lock);
writeback_result = zswap_writeback_entry(entry, swpentry);
@@ -1173,7 +1179,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
zswap_written_back_pages++;
}
- spin_lock(lock);
return ret;
}
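
With the list_lru rework referenced above, the per-list lock lives in struct list_lru_one and the walk callback no longer receives a spinlock_t pointer; a callback that needs to sleep drops l->lock itself, as shrink_memcg_cb() now does. A minimal sketch of a callback under that signature, inferred from the hunk above (the function name is illustrative):

#include <linux/list_lru.h>

static enum lru_status example_isolate_cb(struct list_head *item,
					  struct list_lru_one *l, void *arg)
{
	/* l->lock is held on entry; isolate the item and keep walking. */
	list_lru_isolate(l, item);
	return LRU_REMOVED;
}
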
@@ -1233,7 +1238,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
nr_backing = zswap_total_pages();
- nr_stored = atomic_read(&zswap_stored_pages);
+ nr_stored = atomic_long_read(&zswap_stored_pages);
}
if (!nr_stored)
@@ -1403,68 +1408,27 @@ resched:
/*********************************
* main API
**********************************/
-bool zswap_store(struct folio *folio)
+
+static ssize_t zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
- swp_entry_t swp = folio->swap;
- pgoff_t offset = swp_offset(swp);
- struct xarray *tree = swap_zswap_tree(swp);
+ swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
- struct obj_cgroup *objcg = NULL;
- struct mem_cgroup *memcg = NULL;
-
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
-
- /* Large folios aren't supported */
- if (folio_test_large(folio))
- return false;
-
- if (!zswap_enabled)
- goto check_old;
-
- /* Check cgroup limits */
- objcg = get_obj_cgroup_from_folio(folio);
- if (objcg && !obj_cgroup_may_zswap(objcg)) {
- memcg = get_mem_cgroup_from_objcg(objcg);
- if (shrink_memcg(memcg)) {
- mem_cgroup_put(memcg);
- goto reject;
- }
- mem_cgroup_put(memcg);
- }
-
- if (zswap_check_limits())
- goto reject;
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- goto reject;
- }
-
- /* if entry is successfully added, it keeps the reference */
- entry->pool = zswap_pool_current_get();
- if (!entry->pool)
- goto freepage;
-
- if (objcg) {
- memcg = get_mem_cgroup_from_objcg(objcg);
- if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
- mem_cgroup_put(memcg);
- goto put_pool;
- }
- mem_cgroup_put(memcg);
+ return -EINVAL;
}
- if (!zswap_compress(folio, entry))
- goto put_pool;
-
- entry->swpentry = swp;
- entry->objcg = objcg;
- entry->referenced = true;
+ if (!zswap_compress(page, entry, pool))
+ goto compress_failed;
- old = xa_store(tree, offset, entry, GFP_KERNEL);
+ old = xa_store(swap_zswap_tree(page_swpentry),
+ swp_offset(page_swpentry),
+ entry, GFP_KERNEL);
if (xa_is_err(old)) {
int err = xa_err(old);
@@ -1481,10 +1445,15 @@ bool zswap_store(struct folio *folio)
if (old)
zswap_entry_free(old);
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, entry->length);
- count_objcg_events(objcg, ZSWPOUT, 1);
- }
+ /*
+ * The entry is successfully compressed and stored in the tree; there is
+ * no further possibility of failure. Grab refs to the pool and objcg.
+ * These refs will be dropped by zswap_entry_free() when the entry is
+ * removed from the tree.
+ */
+ zswap_pool_get(pool);
+ if (objcg)
+ obj_cgroup_get(objcg);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1496,37 +1465,115 @@ bool zswap_store(struct folio *folio)
* The publishing order matters to prevent writeback from seeing
* an incoherent entry.
*/
+ entry->pool = pool;
+ entry->swpentry = page_swpentry;
+ entry->objcg = objcg;
+ entry->referenced = true;
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&zswap_list_lru, entry);
}
- /* update stats */
- atomic_inc(&zswap_stored_pages);
- count_vm_event(ZSWPOUT);
-
- return true;
+ return entry->length;
store_failed:
- zpool_free(entry->pool->zpool, entry->handle);
-put_pool:
- zswap_pool_put(entry->pool);
-freepage:
+ zpool_free(pool->zpool, entry->handle);
+compress_failed:
zswap_entry_cache_free(entry);
-reject:
+ return -EINVAL;
+}
+
+bool zswap_store(struct folio *folio)
+{
+ long nr_pages = folio_nr_pages(folio);
+ swp_entry_t swp = folio->swap;
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg = NULL;
+ struct zswap_pool *pool;
+ size_t compressed_bytes = 0;
+ bool ret = false;
+ long index;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
+
+ if (!zswap_enabled)
+ goto check_old;
+
+ objcg = get_obj_cgroup_from_folio(folio);
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (shrink_memcg(memcg)) {
+ mem_cgroup_put(memcg);
+ goto put_objcg;
+ }
+ mem_cgroup_put(memcg);
+ }
+
+ if (zswap_check_limits())
+ goto put_objcg;
+
+ pool = zswap_pool_current_get();
+ if (!pool)
+ goto put_objcg;
+
+ if (objcg) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
+ mem_cgroup_put(memcg);
+ goto put_pool;
+ }
+ mem_cgroup_put(memcg);
+ }
+
+ for (index = 0; index < nr_pages; ++index) {
+ struct page *page = folio_page(folio, index);
+ ssize_t bytes;
+
+ bytes = zswap_store_page(page, objcg, pool);
+ if (bytes < 0)
+ goto put_pool;
+ compressed_bytes += bytes;
+ }
+
+ if (objcg) {
+ obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ count_objcg_events(objcg, ZSWPOUT, nr_pages);
+ }
+
+ atomic_long_add(nr_pages, &zswap_stored_pages);
+ count_vm_events(ZSWPOUT, nr_pages);
+
+ ret = true;
+
+put_pool:
+ zswap_pool_put(pool);
+put_objcg:
obj_cgroup_put(objcg);
- if (zswap_pool_reached_full)
+ if (!ret && zswap_pool_reached_full)
queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
- * If the zswap store fails or zswap is disabled, we must invalidate the
- * possibly stale entry which was previously stored at this offset.
- * Otherwise, writeback could overwrite the new data in the swapfile.
+ * If the zswap store fails or zswap is disabled, we must invalidate
+ * the possibly stale entries which were previously stored at the
+ * offsets corresponding to each page of the folio. Otherwise,
+ * writeback could overwrite the new data in the swapfile.
*/
- entry = xa_erase(tree, offset);
- if (entry)
- zswap_entry_free(entry);
- return false;
+ if (!ret) {
+ unsigned type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct zswap_entry *entry;
+ struct xarray *tree;
+
+ for (index = 0; index < nr_pages; ++index) {
+ tree = swap_zswap_tree(swp_entry(type, offset + index));
+ entry = xa_erase(tree, offset + index);
+ if (entry)
+ zswap_entry_free(entry);
+ }
+ }
+
+ return ret;
}
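
The rewritten zswap_store() relies on a large folio occupying folio_nr_pages() consecutive swap slots, which is how the failure path above derives each page's xarray index from the folio's first swap entry. A short sketch of that slot arithmetic (an illustrative helper, not zswap code):

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

static void example_walk_folio_swap_slots(struct folio *folio)
{
	swp_entry_t first = folio->swap;
	long nr = folio_nr_pages(folio);
	long i;

	for (i = 0; i < nr; i++) {
		/* Swap entry backing page 'i' of the folio. */
		swp_entry_t slot = swp_entry(swp_type(first),
					     swp_offset(first) + i);
		(void)slot;
	}
}
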
bool zswap_load(struct folio *folio)
@@ -1594,6 +1641,9 @@ void zswap_invalidate(swp_entry_t swp)
struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
+ if (xa_empty(tree))
+ return;
+
entry = xa_erase(tree, offset);
if (entry)
zswap_entry_free(entry);
@@ -1651,6 +1701,13 @@ static int debugfs_get_total_size(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
+static int debugfs_get_stored_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1674,8 +1731,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_file("pool_total_size", 0444,
zswap_debugfs_root, NULL, &total_size_fops);
- debugfs_create_atomic_t("stored_pages", 0444,
- zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_file("stored_pages", 0444,
+ zswap_debugfs_root, NULL, &stored_pages_fops);
return 0;
}
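
Because debugfs_create_atomic_t() only handles 32-bit atomic_t, the widened zswap_stored_pages counter is now exported through a getter and DEFINE_DEBUGFS_ATTRIBUTE(), as the last hunk shows. A self-contained sketch of the same pattern (all names are illustrative):

#include <linux/atomic.h>
#include <linux/debugfs.h>

static atomic_long_t example_counter = ATOMIC_LONG_INIT(0);

static int example_counter_get(void *data, u64 *val)
{
	*val = atomic_long_read(&example_counter);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(example_counter_fops, example_counter_get,
			 NULL, "%llu\n");

static void example_debugfs_init(struct dentry *root)
{
	/* Read-only file under the given debugfs directory. */
	debugfs_create_file("example_counter", 0444, root, NULL,
			    &example_counter_fops);
}
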