Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c        14
-rw-r--r--  mm/init-mm.c         4
-rw-r--r--  mm/kfence/Makefile   2
-rw-r--r--  mm/kfence/core.c    42
-rw-r--r--  mm/ksm.c            11
-rw-r--r--  mm/memory.c         16
-rw-r--r--  mm/mmap.c           10
-rw-r--r--  mm/mprotect.c        2
-rw-r--r--  mm/page_alloc.c      3
-rw-r--r--  mm/slab.c            2
-rw-r--r--  mm/swapfile.c        3
-rw-r--r--  mm/vmalloc.c        36
12 files changed, 100 insertions, 45 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..245038a9fe4e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct folio *pagecache_folio, spinlock_t *ptl)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
- pte_t pte;
+ pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
struct page *old_page;
struct folio *new_folio;
@@ -5488,6 +5488,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct mmu_notifier_range range;
/*
+ * Never handle CoW for uffd-wp protected pages. It should only be
+ * handled when the uffd-wp protection is removed.
+ *
+ * Note that only the CoW optimization path (in hugetlb_no_page())
+ * can trigger this, because hugetlb_fault() will always resolve the
+ * uffd-wp bit first.
+ */
+ if (!unshare && huge_pte_uffd_wp(pte))
+ return 0;
+
+ /*
* hugetlb does not support FOLL_FORCE-style write faults that keep the
* PTE mapped R/O such as maybe_mkwrite() would do.
*/
@@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
delayacct_wpcopy_start();
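
The early return added above leans on an ordering invariant spelled out in the comment: hugetlb_fault() resolves uffd-wp before it ever considers copy-on-write, so a CoW request that still sees the bit must have arrived via the hugetlb_no_page() optimization and has to bail. A hypothetical, heavily simplified dispatch sketch of that ordering (not the real hugetlb_fault(); do_hypothetical_cow() is made up):

	#include <linux/mm.h>
	#include <linux/hugetlb.h>
	#include <linux/userfaultfd_k.h>

	/* Sketch of the ordering the new check relies on; not real hugetlb code. */
	static vm_fault_t fault_dispatch_sketch(struct vm_fault *vmf, pte_t pte)
	{
		/* uffd-wp is resolved first: hand the fault to userspace... */
		if (userfaultfd_wp(vmf->vma) && huge_pte_uffd_wp(pte))
			return handle_userfault(vmf, VM_UFFD_WP);

		/* ...so a write fault that reaches the CoW path never carries the bit. */
		if (vmf->flags & FAULT_FLAG_WRITE)
			return do_hypothetical_cow(vmf, pte);

		return 0;
	}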
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..a084039f55d8 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -10,7 +10,7 @@
#include <linux/atomic.h>
#include <linux/user_namespace.h>
-#include <linux/ioasid.h>
+#include <linux/iommu.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -40,7 +40,7 @@ struct mm_struct init_mm = {
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
#ifdef CONFIG_IOMMU_SVA
- .pasid = INVALID_IOASID,
+ .pasid = IOMMU_PASID_INVALID,
#endif
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index 0bb95728a784..2de2a58d11a1 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -2,5 +2,5 @@
obj-y := core.o report.o
-CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls
obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5349c37a5dac..1065e0568d05 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(&pages[i]);
+ struct slab *slab = page_slab(nth_page(pages, i));
if (!i || (i % 2))
continue;
- /* Verify we do not have a compound head page. */
- if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
- return addr;
-
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
@@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void)
/* Protect the right redzone. */
if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
- return addr;
+ goto reset_slab;
addr += 2 * PAGE_SIZE;
}
return 0;
+
+reset_slab:
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ struct slab *slab = page_slab(nth_page(pages, i));
+
+ if (!i || (i % 2))
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
+
+ return addr;
}
static bool __init kfence_init_pool_early(void)
@@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
- for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
- struct slab *slab = virt_to_slab(p);
-
- if (!slab)
- continue;
-#ifdef CONFIG_MEMCG
- slab->memcg_data = 0;
-#endif
- __folio_clear_slab(slab_folio(slab));
- }
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
@@ -726,10 +726,14 @@ static const struct seq_operations objects_sops = {
};
DEFINE_SEQ_ATTRIBUTE(objects);
-static int __init kfence_debugfs_init(void)
+static int kfence_debugfs_init(void)
{
- struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+ struct dentry *kfence_dir;
+
+ if (!READ_ONCE(kfence_enabled))
+ return 0;
+ kfence_dir = debugfs_create_dir("kfence", NULL);
debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
return 0;
@@ -883,6 +887,8 @@ static int kfence_init_late(void)
}
kfence_init_enable();
+ kfence_debugfs_init();
+
return 0;
}
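
The debugfs hunks drop __init from kfence_debugfs_init() and gate it on kfence_enabled, so the same helper works both from the usual initcall and when KFENCE is only brought up late via kfence_init_late(). A minimal sketch of that pattern, with hypothetical myfeat_* names standing in for the real symbols:

	#include <linux/debugfs.h>
	#include <linux/init.h>

	/* Sketch only; the myfeat_* names are hypothetical, not KFENCE internals. */
	static bool myfeat_enabled;
	static const struct file_operations myfeat_stats_fops;	/* placeholder fops */

	static int myfeat_debugfs_init(void)
	{
		struct dentry *dir;

		if (!READ_ONCE(myfeat_enabled))	/* nothing to expose yet */
			return 0;

		dir = debugfs_create_dir("myfeat", NULL);
		debugfs_create_file("stats", 0444, dir, NULL, &myfeat_stats_fops);
		return 0;
	}
	late_initcall(myfeat_debugfs_init);

	/* The late bring-up path calls the same helper once the feature is live. */
	static int myfeat_enable_late(void)
	{
		WRITE_ONCE(myfeat_enabled, true);
		return myfeat_debugfs_init();
	}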
diff --git a/mm/ksm.c b/mm/ksm.c
index ad591b779d53..2b8d30068cbb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -988,9 +988,15 @@ static int unmerge_and_remove_all_rmap_items(void)
mm = mm_slot->slot.mm;
mmap_read_lock(mm);
+
+ /*
+ * Exit right away if the mm is exiting, to avoid a lockdep issue
+ * in the maple tree.
+ */
+ if (ksm_test_exit(mm))
+ goto mm_exiting;
+
for_each_vma(vmi, vma) {
- if (ksm_test_exit(mm))
- break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
@@ -999,6 +1005,7 @@ static int unmerge_and_remove_all_rmap_items(void)
goto error;
}
+mm_exiting:
remove_trailing_rmap_items(&mm_slot->rmap_list);
mmap_read_unlock(mm);
diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049c..01a23ad48a04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
+ /*
+ * We need a reference to lock the folio because we don't hold
+ * the PTL, so a racing thread can remove the device-exclusive
+ * entry and unmap it. If the folio is free, the entry must
+ * have been removed already. If it happens to have already
+ * been re-allocated after being freed, all we do is lock and
+ * unlock it.
+ */
+ if (!folio_try_get(folio))
+ return 0;
+
+ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ folio_put(folio);
return VM_FAULT_RETRY;
+ }
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
@@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
+ folio_put(folio);
mmu_notifier_invalidate_range_end(&range);
return 0;
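
The fix above takes a speculative reference before locking: without the PTL the folio can be freed, or freed and re-used, underneath us, and folio_try_get() is what makes the subsequent lock safe. A stripped-down sketch of that get-then-lock discipline, outside the fault path (inspect_folio_sketch() is made up):

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* Sketch of the "pin, lock, re-validate" discipline used above. */
	static void inspect_folio_sketch(struct folio *folio)
	{
		if (!folio_try_get(folio))	/* already freed: nothing to do */
			return;

		folio_lock(folio);		/* safe to sleep, we hold a reference */
		/*
		 * Re-check the folio's state here: after the speculative get it
		 * may have been re-used for something entirely different.
		 */
		folio_unlock(folio);
		folio_put(folio);		/* drop the speculative reference */
	}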
diff --git a/mm/mmap.c b/mm/mmap.c
index 740b54be3ed4..ff68a67a2a7c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
int count = 0;
int error = -ENOMEM;
MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
/*
@@ -2621,12 +2621,7 @@ cannot_expand:
if (map_deny_write_exec(vma, vma->vm_flags)) {
error = -EACCES;
- if (file)
- goto close_and_free_vma;
- else if (vma->vm_file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
+ goto close_and_free_vma;
}
/* Allow architectures to sanity-check the vm_flags */
@@ -3042,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 231929f119d9..13e84d8c0797 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -805,7 +805,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (map_deny_write_exec(vma, newflags)) {
error = -EACCES;
- goto out;
+ break;
}
/* Allow architectures to sanity-check the new flags */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac1fc986af44..7136c36c5d01 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1398,6 +1398,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1470,7 +1471,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- if (!should_skip_kasan_poison(page, fpi_flags)) {
+ if (!skip_kasan_poison) {
kasan_poison_pages(page, order, init);
/* Memory is already initialized if KASAN did it internally. */
diff --git a/mm/slab.c b/mm/slab.c
index dabc2a671fc6..edbe722fb906 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -839,7 +839,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
return 0;
}
-#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
/*
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62ba2bf577d7..2c718f45745f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
+ assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
@@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- del_from_avail_list(p);
spin_lock(&p->lock);
+ del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
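
Besides moving the call under p->lock, the hunk documents the requirement with assert_spin_locked(), so any future caller that repeats the old ordering trips the assertion immediately. A minimal sketch of that convention (struct some_obj and the helpers are hypothetical):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct some_obj {
		spinlock_t lock;
		struct list_head node;
	};

	/* Double-underscore helper: the caller must already hold obj->lock. */
	static void __remove_from_lists(struct some_obj *obj)
	{
		assert_spin_locked(&obj->lock);
		list_del(&obj->node);
	}

	static void remove_obj(struct some_obj *obj)
	{
		spin_lock(&obj->lock);
		__remove_from_lists(obj);	/* ordering is enforced, not just implied */
		spin_unlock(&obj->lock);
	}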
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef910bf349e1..a50072066221 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2883,6 +2883,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
+ gfp_t alloc_gfp = gfp;
+ bool nofail = false;
struct page *page;
int i;
@@ -2893,6 +2895,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* more permissive.
*/
if (!order) {
+ /* The bulk allocator does not officially support __GFP_NOFAIL requests. */
gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
while (nr_allocated < nr_pages) {
@@ -2931,20 +2934,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
+ } else if (gfp & __GFP_NOFAIL) {
+ /*
+ * Higher-order nofail allocations are really expensive and
+ * potentially dangerous (premature OOM, disruptive reclaim,
+ * compaction, etc.).
+ */
+ alloc_gfp &= ~__GFP_NOFAIL;
+ nofail = true;
}
/* High-order pages or fallback path if "bulk" fails. */
-
while (nr_allocated < nr_pages) {
if (fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
- page = alloc_pages(gfp, order);
+ page = alloc_pages(alloc_gfp, order);
else
- page = alloc_pages_node(nid, gfp, order);
- if (unlikely(!page))
- break;
+ page = alloc_pages_node(nid, alloc_gfp, order);
+ if (unlikely(!page)) {
+ if (!nofail)
+ break;
+
+ /* fall back to order-0 allocations */
+ alloc_gfp |= __GFP_NOFAIL;
+ order = 0;
+ continue;
+ }
+
/*
* Higher order allocations must be able to be treated as
* independent small pages by callers (as they can with
@@ -3024,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, page order %u, failed to allocate pages",
- area->nr_pages * PAGE_SIZE, page_order);
+ /* vm_area_alloc_pages() can also fail due to a fatal signal */
+ if (!fatal_signal_pending(current))
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, page order %u, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}