diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2023-02-15 12:33:28 -0500 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2023-02-15 12:33:28 -0500 |
commit | 33436335e93a1788a58443fc99c5ab320ce4b9d9 (patch) | |
tree | d92f88768c8dbd00f8b65164f47a8a091768b95a /mm | |
parent | 27b025ebb0f6092d5c0a88de2ab73545bc1c496e (diff) | |
parent | c39cea6f38eefe356d64d0bc1e1f2267e282cdd3 (diff) |
Merge tag 'kvm-riscv-6.3-1' of https://github.com/kvm-riscv/linux into HEAD
KVM/riscv changes for 6.3
- Fix wrong usage of PGDIR_SIZE to check page sizes
- Fix privilege mode setting in kvm_riscv_vcpu_trap_redirect()
- Redirect illegal instruction traps to guest
- SBI PMU support for guest
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 102 | ||||
-rw-r--r-- | mm/kasan/report.c | 2 | ||||
-rw-r--r-- | mm/khugepaged.c | 38 | ||||
-rw-r--r-- | mm/kmemleak.c | 5 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 8 | ||||
-rw-r--r-- | mm/memcontrol.c | 67 | ||||
-rw-r--r-- | mm/memory.c | 14 | ||||
-rw-r--r-- | mm/mempolicy.c | 3 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mprotect.c | 8 | ||||
-rw-r--r-- | mm/mremap.c | 25 | ||||
-rw-r--r-- | mm/nommu.c | 9 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 1 | ||||
-rw-r--r-- | mm/vmscan.c | 9 | ||||
-rw-r--r-- | mm/zsmalloc.c | 237 |
19 files changed, 383 insertions, 164 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index ca1603524bbe..8238e83385a7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1839,6 +1839,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; + set_pageblock_skip(freepage); break; } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index db895230ee7e..bdbfeb6fb393 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -1181,7 +1183,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. - * Called with mm->mmap_sem writer semaphore held. + * Called with mm->mmap_lock writer semaphore held. * This function should be only used by move_vma() and operate on * same sized vma. It should never come here with last ref on the * reservation. @@ -4834,6 +4836,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + + /* + * PMD sharing is only possible for PUD_SIZE-aligned address ranges + * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this + * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + */ + if (addr & ~PUD_MASK) { + /* + * hugetlb_vm_op_split is called right before we attempt to + * split the VMA. We will need to unshare PMDs in the old and + * new VMAs, so let's unshare before we split. + */ + unsigned long floor = addr & PUD_MASK; + unsigned long ceil = floor + PUD_SIZE; + + if (floor >= vma->vm_start && ceil <= vma->vm_end) + hugetlb_unshare_pmds(vma, floor, ceil); + } + return 0; } @@ -5030,6 +5051,9 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { + /* No swap on hugetlb */ + WARN_ON_ONCE( + is_swapin_error_entry(pte_to_swp_entry(entry))); /* * We copy the pte marker only if the dst vma has * uffd-wp enabled. @@ -5131,7 +5155,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, /* * We don't have to worry about the ordering of src and dst ptlocks - * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock. + * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -6639,8 +6663,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); if (!ptep) { - address |= last_addr_mask; - continue; + if (!uffd_wp) { + address |= last_addr_mask; + continue; + } + /* + * Userfaultfd wr-protect requires pgtable + * pre-allocations to install pte markers. + */ + ptep = huge_pte_alloc(mm, vma, address, psize); + if (!ptep) + break; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { @@ -6658,16 +6691,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } pte = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { - spin_unlock(ptl); - continue; - } - if (unlikely(is_hugetlb_entry_migration(pte))) { + /* Nothing to do. */ + } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); struct page *page = pfn_swap_entry_to_page(entry); + pte_t newpte = pte; - if (!is_readable_migration_entry(entry)) { - pte_t newpte; - + if (is_writable_migration_entry(entry)) { if (PageAnon(page)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); @@ -6675,25 +6705,22 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); - if (uffd_wp) - newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - set_huge_pte_at(mm, address, ptep, newpte); pages++; } - spin_unlock(ptl); - continue; - } - if (unlikely(pte_marker_uffd_wp(pte))) { - /* - * This is changing a non-present pte into a none pte, - * no need for huge_ptep_modify_prot_start/commit(). - */ + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + if (!pte_same(pte, newpte)) + set_huge_pte_at(mm, address, ptep, newpte); + } else if (unlikely(is_pte_marker(pte))) { + /* No other markers apply for now. */ + WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); if (uffd_wp_resolve) + /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); - } - if (!huge_pte_none(pte)) { + } else if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@ -7328,26 +7355,21 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } } -/* - * This function will unconditionally remove all the shared pmd pgtable entries - * within the specific vma for a hugetlbfs memory range. - */ -void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - unsigned long address, start, end; + unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; - start = ALIGN(vma->vm_start, PUD_SIZE); - end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - if (start >= end) return; @@ -7379,6 +7401,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_invalidate_range_end(&range); } +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. + */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), + ALIGN_DOWN(vma->vm_end, PUD_SIZE)); +} + #ifdef CONFIG_CMA static bool cma_reserve_called __initdata; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 1d02757e90a3..22598b20c7b7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); * Whether the KASAN KUnit test suite is currently being executed. * Updated in kasan_test.c. */ -bool kasan_kunit_executing; +static bool kasan_kunit_executing; void kasan_kunit_test_suite_start(void) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5cb401aa2b9d..90acfea40c13 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, #endif if (pmd_none(pmde)) return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1460,14 +1468,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return SCAN_VMA_CHECK; - /* - * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings - * that got written to. Without this, we'd have to also lock the - * anon_vma if one exists. - */ - if (vma->anon_vma) - return SCAN_VMA_CHECK; - /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; @@ -1567,8 +1567,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, } /* step 4: remove pte entries */ + /* we make no change to anon, but protect concurrent anon page lookup */ + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); + collapse_and_free_pmd(mm, vma, haddr, pmd); + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); i_mmap_unlock_write(vma->vm_file->f_mapping); maybe_install_pmd: @@ -1644,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * has higher cost too. It would also probably require locking * the anon_vma. */ - if (vma->anon_vma) { + if (READ_ONCE(vma->anon_vma)) { result = SCAN_PAGE_ANON; goto next; } @@ -1673,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. + */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } + /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte * markers installed. Skip it only, so the rest mm/vma @@ -2649,7 +2667,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out_nolock; } - hend = vma->vm_end & HPAGE_PMD_MASK; + hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92f670edbf51..55dc8b8b0616 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") == 0) + else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; + stack_depot_want_early_init(); + } else return -EINVAL; return 0; @@ -2093,7 +2095,6 @@ void __init kmemleak_init(void) if (kmemleak_error) return; - stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); diff --git a/mm/madvise.c b/mm/madvise.c index a56a6d17e201..b6ea204d4e23 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -130,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as - * necessary. Must be called with mmap_sem held for writing; + * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ diff --git a/mm/memblock.c b/mm/memblock.c index d036c7861310..685e30e6d27c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1640,7 +1640,13 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - memblock_free_pages(pfn_to_page(cursor), cursor, 0); + /* + * Reserved pages are always initialized by the end of + * memblock_free_all() (by memmap_init() and, if deferred + * initialization is enabled, memmap_init_reserved_pages()), so + * these pages can be released directly to the buddy allocator. + */ + __free_pages_core(pfn_to_page(cursor), 0); totalram_pages_inc(); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..73afff8062f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,7 +63,6 @@ #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> -#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP, - NULL); + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2685,8 +2683,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options, - NULL); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, - NULL)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP, - NULL)) + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_NODES = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t if_tokens = { - { MEMORY_RECLAIM_NODES, "nodes=%s" }, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | - MEMCG_RECLAIM_PROACTIVE; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - int token; - char value[256]; - nodemask_t nodemask = NODE_MASK_ALL; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; + unsigned int reclaim_options; + int err; buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - token = match_token(start, if_tokens, args); - match_strlcpy(value, args, sizeof(value)); - switch (token) { - case MEMORY_RECLAIM_NODES: - if (nodelist_parse(value, nodemask) < 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options, - &nodemask); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory.c b/mm/memory.c index aad226daf41b..3e836fecd035 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - /* - * We're copying the pgtable should only because dst_vma has - * uffd-wp enabled, do sanity check. - */ - WARN_ON_ONCE(!userfaultfd_wp(dst_vma)); - set_pte_at(dst_mm, addr, dst_pte, pte); + if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) + set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. + * + * This should also cover the case where e.g. the pte changed + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * So is_pte_marker() check is not enough to safely drop the pte. */ - if (is_pte_marker(*vmf->pte)) + if (pte_same(vmf->orig_pte, *vmf->pte)) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..f940395667c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + !hugetlb_pmd_shared(pte))) { if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* diff --git a/mm/mmap.c b/mm/mmap.c index 87d929316d57..425a9349e610 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1; + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_wp(vma)) + return 1; + /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0; @@ -2290,7 +2294,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head - * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * @downgrade: Set to true to attempt a write downgrade of the mmap_lock * * If @downgrade is true, check return code for potential release of the lock. */ @@ -2465,7 +2469,7 @@ map_count_exceeded: * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @downgrade: set to true if the user wants to attempt to write_downgrade the - * mmap_sem + * mmap_lock * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be diff --git a/mm/mprotect.c b/mm/mprotect.c index 908df12caa26..61cf60015a8b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore swapin errors unconditionally, + * because any access should sigbus anyway. + */ + if (is_swapin_error_entry(entry)) + continue; /* * If this is uffd-wp pte marker and we'd like * to unprotect it, drop it; the next page diff --git a/mm/mremap.c b/mm/mremap.c index fe587c5d6591..930f65c315c0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we are adding to - * the already existing vma, vma_merge() will merge this extension with - * the already existing vma (expand operation itself) and possibly also - * with the next vma if it becomes adjacent to the expanded vma and - * otherwise compatible. + * Function vma_merge() is called on the extension we + * are adding to the already existing vma, vma_merge() + * will merge this extension with the already existing + * vma (expand operation itself) and possibly also with + * the next vma if it becomes adjacent to the expanded + * vma and otherwise compatible. + * + * However, vma_merge() can currently fail due to + * is_mergeable_vma() check for vm_ops->close (see the + * comment there). Yet this should not prevent vma + * expanding, so perform a simple expand for such vma. + * Ideally the check for close op should be only done + * when a vma would be actually removed due to a merge. */ - vma = vma_merge(mm, vma, extension_start, extension_end, + if (!vma->vm_ops || !vma->vm_ops->close) { + vma = vma_merge(mm, vma, extension_start, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + vma = NULL; + } if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index 214c70e1d059..5b83938ecb67 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -559,7 +559,6 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { - mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ @@ -587,6 +586,7 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, BUG_ON(!vma->vm_region); setup_vma_to_mm(vma, mm); + mm->map_count++; /* add the VMA to the tree */ vma_mas_store(vma, mas); @@ -1240,6 +1240,7 @@ share: error_just_free: up_write(&nommu_region_sem); error: + mas_destroy(&mas); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1250,7 +1251,6 @@ error: sharing_violation: up_write(&nommu_region_sem); - mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1347,6 +1347,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1398,6 +1399,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); mas_store(&mas, vma); vma_mas_store(new, &mas); + mm->map_count++; return 0; err_mas_preallocate: @@ -1509,7 +1511,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list erase_whole_vma: if (delete_vma_from_mm(vma)) ret = -ENOMEM; - delete_vma(mm, vma); + else + delete_vma(mm, vma); return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index c301487be5fb..0005ab2c29af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -478,12 +478,10 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; - if (shmem_huge_force) - return true; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; if (shmem_huge == SHMEM_HUGE_DENY) return false; + if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) + return true; switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: diff --git a/mm/slab.c b/mm/slab.c index 7a269db050ee..29300fc1289a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2211,6 +2211,8 @@ static int drain_freelist(struct kmem_cache *cache, raw_spin_unlock_irq(&n->list_lock); slab_destroy(cache, slab); nr_freed++; + + cond_resched(); } out: return nr_freed; diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..4fa440e87cd6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1100,6 +1100,7 @@ start_over: goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..bf3eedf0209c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3323,13 +3323,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) if (mem_cgroup_disabled()) return; + /* migration can happen before addition */ + if (!mm->lru_gen.memcg) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; - VM_WARN_ON_ONCE(!mm->lru_gen.memcg); VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); @@ -6754,8 +6757,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6770,7 +6772,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), - .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..702bc3fd687a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -113,7 +113,23 @@ * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 -#define OBJ_TAG_BITS 1 + +#ifdef CONFIG_ZPOOL +/* + * The second least-significant bit in the object's header identifies if the + * value stored at the header is a deferred handle from the last reclaim + * attempt. + * + * As noted above, this is valid because we have room for two bits. + */ +#define OBJ_DEFERRED_HANDLE_TAG 2 +#define OBJ_TAG_BITS 2 +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) +#else +#define OBJ_TAG_BITS 1 +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG +#endif /* CONFIG_ZPOOL */ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) @@ -222,6 +238,12 @@ struct link_free { * Handle of allocated object. */ unsigned long handle; +#ifdef CONFIG_ZPOOL + /* + * Deferred handle of a reclaimed object. + */ + unsigned long deferred_handle; +#endif }; }; @@ -272,8 +294,6 @@ struct zspage { /* links the zspage to the lru list in the pool */ struct list_head lru; bool under_reclaim; - /* list of unfreed handles whose objects have been reclaimed */ - unsigned long *deferred_handles; #endif struct zs_pool *pool; @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, + int tag) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) } else handle = *(unsigned long *)obj; - if (!(handle & OBJ_ALLOCATED_TAG)) + if (!(handle & tag)) return false; - *phandle = handle & ~OBJ_ALLOCATED_TAG; + /* Clear all tags before returning the handle */ + *phandle = handle & ~OBJ_TAG_MASK; return true; } +static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +static bool obj_stores_deferred_handle(struct page *page, void *obj, + unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -946,22 +981,36 @@ unlock: } #ifdef CONFIG_ZPOOL +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx); + /* * Free all the deferred handles whose objects are freed in zs_free. */ -static void free_handles(struct zs_pool *pool, struct zspage *zspage) +static void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) { - unsigned long handle = (unsigned long)zspage->deferred_handles; + int obj_idx = 0; + struct page *page = get_first_page(zspage); + unsigned long handle; - while (handle) { - unsigned long nxt_handle = handle_to_obj(handle); + while (1) { + handle = find_deferred_handle_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } cache_free_handle(pool, handle); - handle = nxt_handle; + obj_idx++; } } #else -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +static inline void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) {} #endif static void __free_zspage(struct zs_pool *pool, struct size_class *class, @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(fg != ZS_EMPTY); /* Free all deferred handles from zs_free */ - free_handles(pool, zspage); + free_handles(pool, class, zspage); next = page = get_first_page(zspage); do { @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); zspage->under_reclaim = false; - zspage->deferred_handles = NULL; #endif set_freeobj(zspage, 0); @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(int class_size, unsigned long obj) +static void obj_free(int class_size, unsigned long obj, unsigned long *handle) { struct link_free *link; struct zspage *zspage; @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj) zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); - - /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - if (likely(!ZsHugePage(zspage))) - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - else - f_page->index = 0; + + if (handle) { +#ifdef CONFIG_ZPOOL + /* Stores the (deferred) handle in the object's header */ + *handle |= OBJ_DEFERRED_HANDLE_TAG; + *handle &= ~OBJ_ALLOCATED_TAG; + + if (likely(!ZsHugePage(zspage))) + link->deferred_handle = *handle; + else + f_page->index = *handle; +#endif + } else { + /* Insert this object in containing zspage's freelist */ + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + set_freeobj(zspage, f_objidx); + } + kunmap_atomic(vaddr); - set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); } @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); #ifdef CONFIG_ZPOOL @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle) * Reclaim needs the handles during writeback. It'll free * them along with the zspage when it's done with them. * - * Record current deferred handle at the memory location - * whose address is given by handle. + * Record current deferred handle in the object's header. */ - record_obj(handle, (unsigned long)zspage->deferred_handles); - zspage->deferred_handles = (unsigned long *)handle; + obj_free(class->size, obj, &handle); spin_unlock(&pool->lock); return; } #endif + obj_free(class->size, obj, NULL); + fullness = fix_fullness_group(class, zspage); if (fullness == ZS_EMPTY) free_zspage(pool, class, zspage); @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find alloced object in zspage from index object and + * Find object with a certain tag in zspage from index object and * return handle. */ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) +static unsigned long find_tagged_obj(struct size_class *class, + struct page *page, int *obj_idx, int tag) { unsigned int offset; int index = *obj_idx; @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_tagged(page, addr + offset, &handle, tag)) break; offset += class->size; @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class, return handle; } +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +/* + * Find object storing a deferred handle in header in zspage from index object + * and return handle. + */ +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + struct zs_compact_control { /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); - obj_free(class->size, used_obj); + obj_free(class->size, used_obj, NULL); } /* Remember last position in this iteration */ @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_destroy_pool); #ifdef CONFIG_ZPOOL +static void restore_freelist(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + unsigned int obj_idx = 0; + unsigned long handle, off = 0; /* off is within-page offset */ + struct page *page = get_first_page(zspage); + struct link_free *prev_free = NULL; + void *prev_page_vaddr = NULL; + + /* in case no free object found */ + set_freeobj(zspage, (unsigned int)(-1UL)); + + while (page) { + void *vaddr = kmap_atomic(page); + struct page *next_page; + + while (off < PAGE_SIZE) { + void *obj_addr = vaddr + off; + + /* skip allocated object */ + if (obj_allocated(page, obj_addr, &handle)) { + obj_idx++; + off += class->size; + continue; + } + + /* free deferred handle from reclaim attempt */ + if (obj_stores_deferred_handle(page, obj_addr, &handle)) + cache_free_handle(pool, handle); + + if (prev_free) + prev_free->next = obj_idx << OBJ_TAG_BITS; + else /* first free object found */ + set_freeobj(zspage, obj_idx); + + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); + /* if last free object in a previous page, need to unmap */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + obj_idx++; + off += class->size; + } + + /* + * Handle the last (full or partial) object on this page. + */ + next_page = get_next_page(page); + if (next_page) { + if (!prev_free || prev_page_vaddr) { + /* + * There is no free object in this page, so we can safely + * unmap it. + */ + kunmap_atomic(vaddr); + } else { + /* update prev_page_vaddr since prev_free is on this page */ + prev_page_vaddr = vaddr; + } + } else { /* this is the last page */ + if (prev_free) { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + prev_free->next = -1UL << OBJ_TAG_BITS; + } + + /* unmap previous page (if not done yet) */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + kunmap_atomic(vaddr); + } + + page = next_page; + off %= PAGE_SIZE; + } +} + static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) { int i, obj_idx, ret = 0; @@ -2561,6 +2728,12 @@ next: return 0; } + /* + * Eviction fails on one of the handles, so we need to restore zspage. + * We need to rebuild its freelist (and free stored deferred handles), + * put it back to the correct size class, and add it to the LRU list. + */ + restore_freelist(pool, class, zspage); putback_zspage(class, zspage); list_add(&zspage->lru, &pool->lru); unlock_zspage(zspage); |