|  |  |  |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-05 16:32:45 -0700 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-05 16:32:45 -0700 |
| commit | 6614a3c3164a5df2b54abb0b3559f51041cf705b (patch) | |
| tree | 1c25c23d9efed988705287fc2ccb78e0e76e311d /mm/hugetlb.c | |
| parent | 74cae210a335d159f2eb822e261adee905b6951a (diff) | |
| parent | 360614c01f81f48a89d8b13f8fa69c3ae0a1f5c7 (diff) | |
Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Most of the MM queue. A few things are still pending.
Liam's maple tree rework didn't make it. This has resulted in a few
other minor patch series being held over for next time.
Multi-gen LRU still isn't merged as we were waiting for mapletree to
stabilize. The current plan is to merge MGLRU into -mm soon and to
later reintroduce mapletree, with a view to hopefully getting both
into 6.1-rc1.
Summary:
- The usual batches of cleanups from Baoquan He, Muchun Song, Miaohe
Lin, Yang Shi, Anshuman Khandual and Mike Rapoport
- Some kmemleak fixes from Patrick Wang and Waiman Long
- DAMON updates from SeongJae Park
- memcg debug/visibility work from Roman Gushchin
- vmalloc speedup from Uladzislau Rezki
- more folio conversion work from Matthew Wilcox
- enhancements for coherent device memory mapping from Alex Sierra
- addition of shared pages tracking and CoW support for fsdax, from
Shiyang Ruan
- hugetlb optimizations from Mike Kravetz
- Mel Gorman has contributed some pagealloc changes to improve
latency and realtime behaviour.
- mprotect soft-dirty checking has been improved by Peter Xu
- Many other singleton patches all over the place"
[ XFS merge from hell as per Darrick Wong in
https://lore.kernel.org/all/YshKnxb4VwXycPO8@magnolia/ ]
* tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (282 commits)
tools/testing/selftests/vm/hmm-tests.c: fix build
mm: Kconfig: fix typo
mm: memory-failure: convert to pr_fmt()
mm: use is_zone_movable_page() helper
hugetlbfs: fix inaccurate comment in hugetlbfs_statfs()
hugetlbfs: cleanup some comments in inode.c
hugetlbfs: remove unneeded header file
hugetlbfs: remove unneeded hugetlbfs_ops forward declaration
hugetlbfs: use helper macro SZ_1{K,M}
mm: cleanup is_highmem()
mm/hmm: add a test for cross device private faults
selftests: add soft-dirty into run_vmtests.sh
selftests: soft-dirty: add test for mprotect
mm/mprotect: fix soft-dirty check in can_change_pte_writable()
mm: memcontrol: fix potential oom_lock recursion deadlock
mm/gup.c: fix formatting in check_and_migrate_movable_page()
xfs: fail dax mount if reflink is enabled on a partition
mm/memcontrol.c: remove the redundant updating of stats_flush_threshold
userfaultfd: don't fail on unrecognized features
hugetlb_cgroup: fix wrong hugetlb cgroup numa stat
...
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 169 |
1 file changed, 97 insertions, 72 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aa39534898e0..f044962ad9df 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
 __initdata LIST_HEAD(huge_boot_pages);
 
 /* for command line parsing */
@@ -1135,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
         lockdep_assert_held(&hugetlb_lock);
         list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-                if (pin && !is_pinnable_page(page))
+                if (pin && !is_longterm_pinnable_page(page))
                         continue;
 
                 if (PageHWPoison(page))
@@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
         unsigned long pfn;
         struct page *page;
         int rc = 0;
+        unsigned int order;
+        struct hstate *h;
 
         if (!hugepages_supported())
                 return rc;
 
-        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+        order = huge_page_order(&default_hstate);
+        for_each_hstate(h)
+                order = min(order, huge_page_order(h));
+
+        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
                 page = pfn_to_page(pfn);
                 rc = dissolve_free_huge_page(page);
                 if (rc)
@@ -2766,8 +2766,7 @@ retry:
                  * Fail with -EBUSY if not possible.
                  */
                 spin_unlock_irq(&hugetlb_lock);
-                if (!isolate_huge_page(old_page, list))
-                        ret = -EBUSY;
+                ret = isolate_hugetlb(old_page, list);
                 spin_lock_irq(&hugetlb_lock);
                 goto free_new;
         } else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
         if (hstate_is_gigantic(h))
                 return -ENOMEM;
 
-        if (page_count(head) && isolate_huge_page(head, list))
+        if (page_count(head) && !isolate_hugetlb(head, list))
                 ret = 0;
         else if (!page_count(head))
                 ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3148,6 @@ static void __init hugetlb_init_hstates(void)
         struct hstate *h, *h2;
 
         for_each_hstate(h) {
-                if (minimum_order > huge_page_order(h))
-                        minimum_order = huge_page_order(h);
-
                 /* oversize hugepages were init'ed in early boot */
                 if (!hstate_is_gigantic(h))
                         hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3172,6 @@ static void __init hugetlb_init_hstates(void)
                                 h->demote_order = h2->order;
                         }
                 }
-        VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
 static void __init report_hugepages(void)
@@ -4482,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
                              nid, h->surplus_huge_pages_node[nid]);
 }
 
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
         struct hstate *h;
-        int nid;
 
         if (!hugepages_supported())
                 return;
 
-        for_each_node_state(nid, N_MEMORY)
-                for_each_hstate(h)
-                        pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-                                nid,
-                                h->nr_huge_pages_node[nid],
-                                h->free_huge_pages_node[nid],
-                                h->surplus_huge_pages_node[nid],
-                                huge_page_size(h) / SZ_1K);
+        for_each_hstate(h)
+                printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+                       nid,
+                       h->nr_huge_pages_node[nid],
+                       h->free_huge_pages_node[nid],
+                       h->surplus_huge_pages_node[nid],
+                       huge_page_size(h) / SZ_1K);
 }
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -4732,6 +4725,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
         unsigned long npages = pages_per_huge_page(h);
         struct address_space *mapping = src_vma->vm_file->f_mapping;
         struct mmu_notifier_range range;
+        unsigned long last_addr_mask;
         int ret = 0;
 
         if (cow) {
@@ -4751,11 +4745,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 i_mmap_lock_read(mapping);
         }
 
+        last_addr_mask = hugetlb_mask_last_page(h);
         for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                 spinlock_t *src_ptl, *dst_ptl;
                 src_pte = huge_pte_offset(src, addr, sz);
-                if (!src_pte)
+                if (!src_pte) {
+                        addr |= last_addr_mask;
                         continue;
+                }
                 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                 if (!dst_pte) {
                         ret = -ENOMEM;
@@ -4772,8 +4769,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                  * after taking the lock below.
                  */
                 dst_entry = huge_ptep_get(dst_pte);
-                if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+                if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+                        addr |= last_addr_mask;
                         continue;
+                }
 
                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
                 src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4808,12 +4807,11 @@ again:
                                 entry = swp_entry_to_pte(swp_entry);
                                 if (userfaultfd_wp(src_vma) && uffd_wp)
                                         entry = huge_pte_mkuffd_wp(entry);
-                                set_huge_swap_pte_at(src, addr, src_pte,
-                                                     entry, sz);
+                                set_huge_pte_at(src, addr, src_pte, entry);
                         }
                         if (!userfaultfd_wp(dst_vma) && uffd_wp)
                                 entry = huge_pte_clear_uffd_wp(entry);
-                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+                        set_huge_pte_at(dst, addr, dst_pte, entry);
                 } else if (unlikely(is_pte_marker(entry))) {
                         /*
                          * We copy the pte marker only if the dst vma has
@@ -4880,7 +4878,7 @@ again:
                          * table protection not changing it to point
                          * to a new page.
                          *
-                         * See Documentation/vm/mmu_notifier.rst
+                         * See Documentation/mm/mmu_notifier.rst
                          */
                         huge_ptep_set_wrprotect(src, addr, src_pte);
                         entry = huge_pte_wrprotect(entry);
@@ -4939,7 +4937,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
         unsigned long sz = huge_page_size(h);
         struct mm_struct *mm = vma->vm_mm;
         unsigned long old_end = old_addr + len;
-        unsigned long old_addr_copy;
+        unsigned long last_addr_mask;
         pte_t *src_pte, *dst_pte;
         struct mmu_notifier_range range;
         bool shared_pmd = false;
@@ -4954,23 +4952,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
         flush_cache_range(vma, range.start, range.end);
         mmu_notifier_invalidate_range_start(&range);
 
+        last_addr_mask = hugetlb_mask_last_page(h);
         /* Prevent race with file truncation */
         i_mmap_lock_write(mapping);
         for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                 src_pte = huge_pte_offset(mm, old_addr, sz);
-                if (!src_pte)
+                if (!src_pte) {
+                        old_addr |= last_addr_mask;
+                        new_addr |= last_addr_mask;
                         continue;
+                }
 
                 if (huge_pte_none(huge_ptep_get(src_pte)))
                         continue;
 
-                /* old_addr arg to huge_pmd_unshare() is a pointer and so the
-                 * arg may be modified. Pass a copy instead to preserve the
-                 * value in old_addr.
-                 */
-                old_addr_copy = old_addr;
-
-                if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+                if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
                         shared_pmd = true;
+                        old_addr |= last_addr_mask;
+                        new_addr |= last_addr_mask;
                         continue;
                 }
@@ -5004,6 +5002,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
         struct mmu_notifier_range range;
+        unsigned long last_addr_mask;
         bool force_flush = false;
 
         WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5024,17 +5023,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                                 end);
         adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
         mmu_notifier_invalidate_range_start(&range);
+        last_addr_mask = hugetlb_mask_last_page(h);
         address = start;
         for (; address < end; address += sz) {
                 ptep = huge_pte_offset(mm, address, sz);
-                if (!ptep)
+                if (!ptep) {
+                        address |= last_addr_mask;
                         continue;
+                }
 
                 ptl = huge_pte_lock(h, mm, ptep);
-                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+                if (huge_pmd_unshare(mm, vma, address, ptep)) {
                         spin_unlock(ptl);
                         tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
                         force_flush = true;
+                        address |= last_addr_mask;
                         continue;
                 }
 
@@ -5714,7 +5717,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          */
         entry = huge_ptep_get(ptep);
         if (unlikely(is_hugetlb_entry_migration(entry))) {
-                migration_entry_wait_huge(vma, mm, ptep);
+                migration_entry_wait_huge(vma, ptep);
                 return 0;
         } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                 return VM_FAULT_HWPOISON_LARGE |
@@ -6052,8 +6055,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
         set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
-        (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
-                                        dst_vma->vm_flags & VM_WRITE);
         hugetlb_count_add(pages_per_huge_page(h), dst_mm);
 
         /* No need to invalidate - it was non-present before */
@@ -6305,6 +6306,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         unsigned long pages = 0, psize = huge_page_size(h);
         bool shared_pmd = false;
         struct mmu_notifier_range range;
+        unsigned long last_addr_mask;
         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
@@ -6321,14 +6323,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         flush_cache_range(vma, range.start, range.end);
         mmu_notifier_invalidate_range_start(&range);
 
+        last_addr_mask = hugetlb_mask_last_page(h);
         i_mmap_lock_write(vma->vm_file->f_mapping);
         for (; address < end; address += psize) {
                 spinlock_t *ptl;
                 ptep = huge_pte_offset(mm, address, psize);
-                if (!ptep)
+                if (!ptep) {
+                        address |= last_addr_mask;
                         continue;
+                }
                 ptl = huge_pte_lock(h, mm, ptep);
-                if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+                if (huge_pmd_unshare(mm, vma, address, ptep)) {
                         /*
                          * When uffd-wp is enabled on the vma, unshare
                          * shouldn't happen at all.  Warn about it if it
@@ -6338,6 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                         pages++;
                         spin_unlock(ptl);
                         shared_pmd = true;
+                        address |= last_addr_mask;
                         continue;
                 }
                 pte = huge_ptep_get(ptep);
@@ -6363,8 +6369,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                                 newpte = pte_swp_mkuffd_wp(newpte);
                         else if (uffd_wp_resolve)
                                 newpte = pte_swp_clear_uffd_wp(newpte);
-                        set_huge_swap_pte_at(mm, address, ptep,
-                                             newpte, psize);
+                        set_huge_pte_at(mm, address, ptep, newpte);
                         pages++;
                 }
                 spin_unlock(ptl);
@@ -6415,7 +6420,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * No need to call mmu_notifier_invalidate_range() we are downgrading
          * page table protection not changing it to point to a new page.
          *
-         * See Documentation/vm/mmu_notifier.rst
+         * See Documentation/mm/mmu_notifier.rst
          */
         i_mmap_unlock_write(vma->vm_file->f_mapping);
         mmu_notifier_invalidate_range_end(&range);
@@ -6761,11 +6766,11 @@ out:
  * 0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                                        unsigned long *addr, pte_t *ptep)
+                                        unsigned long addr, pte_t *ptep)
 {
-        pgd_t *pgd = pgd_offset(mm, *addr);
-        p4d_t *p4d = p4d_offset(pgd, *addr);
-        pud_t *pud = pud_offset(p4d, *addr);
+        pgd_t *pgd = pgd_offset(mm, addr);
+        p4d_t *p4d = p4d_offset(pgd, addr);
+        pud_t *pud = pud_offset(p4d, addr);
 
         i_mmap_assert_write_locked(vma->vm_file->f_mapping);
         BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6775,14 +6780,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
         pud_clear(pud);
         put_page(virt_to_page(ptep));
         mm_dec_nr_pmds(mm);
-        /*
-         * This update of passed address optimizes loops sequentially
-         * processing addresses in increments of huge page size (PMD_SIZE
-         * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
-         * Update address to the 'last page' in the cleared area so that
-         * calling loop can move to first page past this area.
-         */
-        *addr |= PUD_SIZE - PMD_SIZE;
         return 1;
 }
 
@@ -6794,7 +6791,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                                unsigned long *addr, pte_t *ptep)
+                                unsigned long addr, pte_t *ptep)
 {
         return 0;
 }
@@ -6877,6 +6874,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
         return (pte_t *)pmd;
 }
 
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size.  Used to skip non-present
+ * page table entries when linearly scanning address ranges.  Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+        unsigned long hp_size = huge_page_size(h);
+
+        if (hp_size == PUD_SIZE)
+                return P4D_SIZE - PUD_SIZE;
+        else if (hp_size == PMD_SIZE)
+                return PUD_SIZE - PMD_SIZE;
+        else
+                return 0UL;
+}
+
+#else
+
+/* See description above.  Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+        if (huge_page_size(h) == PMD_SIZE)
+                return PUD_SIZE - PMD_SIZE;
+#endif
+        return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
 /*
@@ -6940,7 +6968,7 @@ retry:
         } else {
                 if (is_hugetlb_entry_migration(pte)) {
                         spin_unlock(ptl);
-                        __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+                        __migration_entry_wait_huge((pte_t *)pmd, ptl);
                         goto retry;
                 }
                 /*
@@ -6972,15 +7000,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
         return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-        bool ret = true;
+        int ret = 0;
 
         spin_lock_irq(&hugetlb_lock);
         if (!PageHeadHuge(page) ||
             !HPageMigratable(page) ||
             !get_page_unless_zero(page)) {
-                ret = false;
+                ret = -EBUSY;
                 goto unlock;
         }
         ClearHPageMigratable(page);
@@ -7100,21 +7128,18 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
         mmu_notifier_invalidate_range_start(&range);
         i_mmap_lock_write(vma->vm_file->f_mapping);
         for (address = start; address < end; address += PUD_SIZE) {
-                unsigned long tmp = address;
-
                 ptep = huge_pte_offset(mm, address, sz);
                 if (!ptep)
                         continue;
                 ptl = huge_pte_lock(h, mm, ptep);
-                /* We don't want 'address' to be changed */
-                huge_pmd_unshare(mm, vma, &tmp, ptep);
+                huge_pmd_unshare(mm, vma, address, ptep);
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
         i_mmap_unlock_write(vma->vm_file->f_mapping);
         /*
          * No need to call mmu_notifier_invalidate_range(), see
-         * Documentation/vm/mmu_notifier.rst.
+         * Documentation/mm/mmu_notifier.rst.
          */
         mmu_notifier_invalidate_range_end(&range);
 }
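The recurring `addr |= last_addr_mask` pattern in the hunks above is the point of the new hugetlb_mask_last_page() helper: when a scan finds no page-table entry (or unshares an entire PMD page), ORing in the mask moves the address to the last huge page of the current page-table page, so the loop's normal `addr += sz` step lands on the next page-table-page boundary instead of walking it one huge page at a time. The following is a minimal userspace sketch of that idiom, not kernel code; the shift values and the simplified `mask_last_page()` helper are illustrative stand-ins for the real hugetlb_mask_last_page().

```c
/*
 * Userspace illustration of the skip-ahead idiom; PMD_SHIFT/PUD_SHIFT and
 * mask_last_page() below are illustrative stand-ins, not the kernel's code.
 */
#include <stdio.h>

#define PMD_SHIFT 21                    /* 2 MB huge pages (x86_64-like) */
#define PUD_SHIFT 30                    /* one PMD page maps 1 GB */
#define PMD_SIZE  (1UL << PMD_SHIFT)
#define PUD_SIZE  (1UL << PUD_SHIFT)

/* Mirrors the generic hugetlb_mask_last_page() for PMD-sized huge pages. */
static unsigned long mask_last_page(void)
{
        return PUD_SIZE - PMD_SIZE;
}

int main(void)
{
        unsigned long end = 2 * PUD_SIZE;       /* scan two PUD-sized regions */
        unsigned long sz = PMD_SIZE;
        unsigned long last_addr_mask = mask_last_page();
        unsigned long addr;
        int iterations = 0;

        for (addr = 0; addr < end; addr += sz) {
                iterations++;
                /*
                 * Pretend the lookup returned NULL for the whole first region:
                 * jump to its last huge page, so the "addr += sz" above moves
                 * straight to the next PUD boundary rather than 2 MB at a time.
                 */
                if (addr < PUD_SIZE) {
                        addr |= last_addr_mask;
                        continue;
                }
                /* second region: one huge page would be processed per pass */
        }

        /* 1 pass to skip the empty region + 512 passes for the mapped one */
        printf("loop iterations: %d\n", iterations);
        return 0;
}
```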
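A related convention change in the same diff: isolate_huge_page(), which returned a bool (true on success), becomes isolate_hugetlb(), which returns 0 or -EBUSY. That is why alloc_and_dissolve_huge_page() can now assign the helper's return value to `ret` directly, while the call in isolate_or_dissolve_huge_page() gains a `!`. A minimal sketch of the flipped predicate, assuming hypothetical old_isolate()/new_isolate() helpers rather than the real kernel functions:

```c
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: old style returns true on success, new style 0/-EBUSY. */
static bool old_isolate(int page) { return page % 2 == 0; }
static int  new_isolate(int page) { return (page % 2) ? -EBUSY : 0; }

int main(void)
{
        int page = 3;           /* a page that cannot be isolated */
        int ret = 0;

        /* before: if (!isolate_huge_page(page, list)) ret = -EBUSY; */
        if (!old_isolate(page))
                ret = -EBUSY;
        printf("old-style ret = %d\n", ret);

        /* after: the error code comes straight from the helper */
        ret = new_isolate(page);
        printf("new-style ret = %d\n", ret);

        /* success tests flip: "isolated" is now a zero return value */
        printf("isolated? old=%d new=%d\n", old_isolate(page), !new_isolate(page));
        return 0;
}
```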