From 04a42e72d77a93a166b79c34b7bc862f55a53967 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Dec 2022 22:17:57 -0800 Subject: mm: move folio_set_compound_order() to mm/internal.h folio_set_compound_order() is moved to an mm-internal location so external folio users cannot misuse this function. Change the name of the function to folio_set_order() and use WARN_ON_ONCE() rather than BUG_ON. Also, handle the case if a non-large folio is passed and add clarifying comments to the function. Link: https://lore.kernel.org/lkml/20221207223731.32784-1-sidhartha.kumar@oracle.com/T/ Link: https://lkml.kernel.org/r/20221215061757.223440-1-sidhartha.kumar@oracle.com Fixes: 9fd330582b2f ("mm: add folio dtor and order setter functions") Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Suggested-by: Matthew Wilcox Suggested-by: John Hubbard Reviewed-by: John Hubbard Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bdbfeb6fb393..cfd47a66ded0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1492,7 +1492,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, set_page_refcounted(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); } @@ -1956,7 +1956,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, __folio_clear_reserved(folio); __folio_set_head(folio); /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_compound_order(folio, order); + folio_set_order(folio, order); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); @@ -2020,7 +2020,7 @@ out_error: p = folio_page(folio, j); __ClearPageReserved(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); return false; } -- cgit v1.2.3-70-g09d2 From f1eb1bacfba9019823b2fce42383f010cd561fa6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 14 Dec 2022 15:15:33 -0500 Subject: mm/uffd: always wr-protect pte in pte|pmd_mkuffd_wp() This patch is a cleanup to always wr-protect pte/pmd in mkuffd_wp paths. The reasons I still think this patch is worthwhile are: (1) It is a cleanup already; diffstat tells. (2) It just feels natural after I thought about this: if the pte is uffd protected, let's remove the write bit no matter what it was. (3) Since x86 is the only arch that supports uffd-wp, it also redefines pte|pmd_mkuffd_wp() in that it should always contain removals of write bits. It means any future arch that wants to implement uffd-wp should naturally follow this rule too. It's good to make it a default, even with the vm_page_prot changes on VM_UFFD_WP. (4) It covers more than vm_page_prot. So no chance of any potential future "accident" (like pte_mkdirty() on sparc64 or loongarch, even though it just got its pte_mkdirty fixed <1 month ago). It'll be fairly clear when reading the code too that we don't need to worry about uncertainty of the write bit before a pte_mkuffd_wp(). We may call pte_wrprotect() one more time in some paths (e.g. thp split), but that should be a fully local bitop instruction so the overhead should be negligible. Although this patch should logically also fix all the recently known issues of uffd-wp on page migration (not for numa hint recovery - that may need another explicit pte_wrprotect), this is not the plan for that fix. So no fixes, and stable doesn't need this.
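For illustration only (a minimal sketch, not part of the patch; the helper name sanitize_uffd_wp() is made up), the invariant the rule above gives to callers:

    /*
     * Marking a pte uffd-wp now also clears the write bit, so callers no
     * longer need to pair pte_mkuffd_wp() with an explicit pte_wrprotect().
     */
    static inline pte_t sanitize_uffd_wp(pte_t pte)
    {
            pte = pte_mkuffd_wp(pte);       /* implies write-protected */
            WARN_ON_ONCE(pte_write(pte));   /* should hold on any arch implementing uffd-wp */
            return pte;
    }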
Link: https://lkml.kernel.org/r/20221214201533.1774616-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ives van Hoorne Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 24 ++++++++++++------------ include/asm-generic/hugetlb.h | 16 ++++++++-------- mm/huge_memory.c | 8 +++----- mm/hugetlb.c | 4 ++-- mm/memory.c | 8 +++----- mm/mprotect.c | 6 ++---- mm/userfaultfd.c | 18 ++---------------- 7 files changed, 32 insertions(+), 52 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0564edd24ffb..1c843395a8b3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -289,6 +289,11 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { @@ -313,7 +318,7 @@ static inline int pte_uffd_wp(pte_t pte) static inline pte_t pte_mkuffd_wp(pte_t pte) { - return pte_set_flags(pte, _PAGE_UFFD_WP); + return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) @@ -332,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte) return pte_clear_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_RW); -} - static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); @@ -401,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { @@ -409,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd) static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_UFFD_WP); + return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) @@ -428,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_DIRTY); } -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - static inline pmd_t pmd_mkdirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index a57d667addd2..d7f6335d3999 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte) return pte_mkwrite(pte); } +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); @@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { - return pte_mkuffd_wp(pte); + return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) @@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte) return huge_pte_none(pte) || is_pte_marker(pte); } -#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} -#endif - #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE 
static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..867f02e6061d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1920,17 +1920,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (uffd_wp) { - entry = pmd_wrprotect(entry); + if (uffd_wp) entry = pmd_mkuffd_wp(entry); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled. */ entry = pmd_clear_uffd_wp(entry); - } /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && @@ -3275,7 +3273,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) - pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cfd47a66ded0..92b3fd01a652 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5919,7 +5919,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * if populated. */ if (unlikely(pte_marker_uffd_wp(old_pte))) - new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); + new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); @@ -6728,7 +6728,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) - pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); diff --git a/mm/memory.c b/mm/memory.c index 3e836fecd035..ca490596b36f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -878,7 +878,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ - pte = pte_wrprotect(pte_mkuffd_wp(pte)); + pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -3950,10 +3950,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - if (pte_swp_uffd_wp(vmf->orig_pte)) { + if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); - pte = pte_wrprotect(pte); - } vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -4296,7 +4294,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(uffd_wp)) - entry = pte_mkuffd_wp(pte_wrprotect(entry)); + entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter(vma->vm_mm, MM_ANONPAGES); diff --git a/mm/mprotect.c b/mm/mprotect.c index 61cf60015a8b..bf8fa0af5a15 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = 
ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (uffd_wp) { - ptent = pte_wrprotect(ptent); + if (uffd_wp) ptent = pte_mkuffd_wp(ptent); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) ptent = pte_clear_uffd_wp(ptent); - } /* * In some writable, shared mappings, we might want diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0499907b6f1a..f8d31b82aceb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - - /* - * Always mark a PTE as write-protected when needed, regardless of - * VM_WRITE, which the user might change. - */ - if (wp_copy) { - _dst_pte = pte_mkuffd_wp(_dst_pte); - writable = false; - } - if (writable) _dst_pte = pte_mkwrite(_dst_pte); - else - /* - * We need this to make sure write bit removed; as mk_pte() - * could return a pte with write bit set. - */ - _dst_pte = pte_wrprotect(_dst_pte); + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); -- cgit v1.2.3-70-g09d2 From 379c2e60e82ff71510a949033bf8431f39f66c75 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 12 Dec 2022 15:50:42 -0800 Subject: hugetlb: update vma flag check for hugetlb vma lock The check for whether a hugetlb vma lock exists partially depends on the vma's flags. Currently, it checks for either VM_MAYSHARE or VM_SHARED. The reason both flags are used is because VM_MAYSHARE was previously cleared in hugetlb vmas as they are tore down. This is no longer the case, and only the VM_MAYSHARE check is required. Link: https://lkml.kernel.org/r/20221212235042.178355-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: James Houghton Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92b3fd01a652..ed1ac2df582c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -262,8 +262,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) */ static bool __vma_shareable_lock(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && - vma->vm_private_data; + return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; } void hugetlb_vma_lock_read(struct vm_area_struct *vma) -- cgit v1.2.3-70-g09d2 From bb373dce2c7b473023f9e69f041a22d81171b71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:53 -0500 Subject: mm/hugetlb: don't wait for migration entry during follow page That's what the code does with !hugetlb pages, so we should logically do the same for hugetlb, so migration entry will also be treated as no page. This is probably also the last piece in follow_page code that may sleep, the last one should be removed in cf994dd8af27 ("mm/gup: remove FOLL_MIGRATION", 2022-11-16). 
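As an illustration (a simplified sketch, not the literal kernel function), the hugetlb follow-page flow after this change reports any non-present huge pte, including a migration entry, as "no page" instead of sleeping on it:

    pte = huge_pte_offset(mm, haddr, huge_page_size(h));
    if (!pte)
            return NULL;
    ptl = huge_pte_lock(h, mm, pte);
    entry = huge_ptep_get(pte);
    if (pte_present(entry))
            page = pte_page(entry) +
                   ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
    else
            page = NULL;    /* migration or hwpoison entry: no page, no retry loop */
    spin_unlock(ptl);
    return page;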
Link: https://lkml.kernel.org/r/20221216155100.2043537-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1ac2df582c..549f79668756 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6401,7 +6401,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; -retry: pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) return NULL; @@ -6424,16 +6423,6 @@ retry: page = NULL; goto out; } - } else { - if (is_hugetlb_entry_migration(entry)) { - spin_unlock(ptl); - __migration_entry_wait_huge(pte, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ } out: spin_unlock(ptl); -- cgit v1.2.3-70-g09d2 From fcd48540d188876c917a377d81cd24c100332a62 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:55 -0500 Subject: mm/hugetlb: move swap entry handling into vma lock when faulted In hugetlb_fault(), there used to have a special path to handle swap entry at the entrance using huge_pte_offset(). That's unsafe because huge_pte_offset() for a pmd sharable range can access freed pgtables if without any lock to protect the pgtable from being freed after pmd unshare. Here the simplest solution to make it safe is to move the swap handling to be after the vma lock being held. We may need to take the fault mutex on either migration or hwpoison entries now (also the vma lock, but that's really needed), however neither of them is hot path. Note that the vma lock cannot be released in hugetlb_fault() when the migration entry is detected, because in migration_entry_wait_huge() the pgtable page will be used again (by taking the pgtable lock), so that also need to be protected by the vma lock. Modify migration_entry_wait_huge() so that it must be called with vma read lock held, and properly release the lock in __migration_entry_wait_huge(). 
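Roughly, the resulting ordering in hugetlb_fault() for a migration entry can be sketched as below (illustrative only, not the exact function body):

    hash = hugetlb_fault_mutex_hash(mapping, idx);
    mutex_lock(&hugetlb_fault_mutex_table[hash]);
    hugetlb_vma_lock_read(vma);             /* pins pgtables against pmd unshare */
    ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
    entry = huge_ptep_get(ptep);
    if (!pte_present(entry) && is_hugetlb_entry_migration(entry)) {
            /* drop the fault mutex, keep the vma lock for the wait */
            mutex_unlock(&hugetlb_fault_mutex_table[hash]);
            migration_entry_wait_huge(vma, ptep);   /* releases the vma lock */
            return 0;
    }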
Link: https://lkml.kernel.org/r/20221216155100.2043537-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ++++-- mm/hugetlb.c | 37 ++++++++++++++++--------------------- mm/migrate.c | 25 +++++++++++++++++++++---- 3 files changed, 41 insertions(+), 27 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b982dd614572..3a451b7afcb3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); #ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); #endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } #ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 549f79668756..7f9db1d9f6a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5993,22 +5993,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); - if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ - entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, ptep); - return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - } - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate @@ -6023,10 +6007,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire vma lock before calling huge_pte_alloc and hold * until finished with ptep. This prevents huge_pmd_unshare from * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. */ hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); @@ -6055,8 +6035,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will * properly handle it. 
*/ - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) { + /* + * Release the hugetlb fault lock now, but retain + * the vma lock, because it is needed to guard the + * huge_pte_lockptr() later in + * migration_entry_wait_huge(). The vma lock will + * be released there. + */ + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + migration_entry_wait_huge(vma, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; + } /* * If we are going to COW/unshare the mapping later, we examine the diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..98de7ce2b576 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, } #ifdef CONFIG_HUGETLB_PAGE -void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) +/* + * The vma read lock must be held upon entry. Holding that lock prevents either + * the pte or the ptl from being freed. + * + * This function will release the vma lock before returning. + */ +void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { pte_t pte; + hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) + if (unlikely(!is_hugetlb_entry_migration(pte))) { spin_unlock(ptl); - else + hugetlb_vma_unlock_read(vma); + } else { + /* + * If migration entry existed, safe to release vma lock + * here because the pgtable page won't be freed without the + * pgtable lock released. See comment right above pgtable + * lock release in migration_entry_wait_on_locked(). + */ + hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + } } void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - __migration_entry_wait_huge(pte, ptl); + __migration_entry_wait_huge(vma, pte, ptl); } #endif -- cgit v1.2.3-70-g09d2 From 7d049f3a03ea705522210d70b9d3e223ef86d663 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:19 -0500 Subject: mm/hugetlb: make hugetlb_follow_page_mask() safe to pmd unshare Since hugetlb_follow_page_mask() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. 
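The general pattern this series converges on can be sketched as follows (illustrative, not a literal excerpt from the patch):

    hugetlb_vma_lock_read(vma);     /* blocks huge_pmd_unshare() from freeing the pgtable page */
    pte = huge_pte_offset(mm, haddr, huge_page_size(h));
    if (pte) {
            ptl = huge_pte_lock(h, mm, pte);
            entry = huge_ptep_get(pte);     /* safe: the pgtable cannot go away here */
            spin_unlock(ptl);
    }
    hugetlb_vma_unlock_read(vma);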
Link: https://lkml.kernel.org/r/20221216155219.2043714-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7f9db1d9f6a5..807edc1410e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6396,9 +6396,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; + hugetlb_vma_lock_read(vma); pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) - return NULL; + goto out_unlock; ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); @@ -6421,6 +6422,8 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } out: spin_unlock(ptl); +out_unlock: + hugetlb_vma_unlock_read(vma); return page; } -- cgit v1.2.3-70-g09d2 From eefc7fa53608920203a1402ecf7255ecfa8bb030 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:23 -0500 Subject: mm/hugetlb: make follow_hugetlb_page() safe to pmd unshare Since follow_hugetlb_page() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155223.2043727-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 807edc1410e5..da4c37553c08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6454,6 +6454,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } + hugetlb_vma_lock_read(vma); /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the @@ -6478,6 +6479,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !hugetlbfs_pagecache_present(h, vma, vaddr)) { if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; break; } @@ -6499,6 +6501,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; else if (unshare) @@ -6561,6 +6565,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, remainder -= pages_per_huge_page(h); i += pages_per_huge_page(h); spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); continue; } @@ -6590,6 +6595,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; err = -ENOMEM; break; @@ -6601,6 +6607,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, i += refs; spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } *nr_pages = remainder; /* -- cgit v1.2.3-70-g09d2 From 9c67a20704e763f9cb8cd262c3e45de7bd2816bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:29 -0500 Subject: mm/hugetlb: introduce hugetlb_walk() huge_pte_offset() is the main walker function for hugetlb pgtables. 
The name is not really representing what it does, though. Instead of renaming it, introduce a wrapper function called hugetlb_walk() which will use huge_pte_offset() inside. Assert on the locks when walking the pgtable. Note, the vma lock assertion will be a no-op for private mappings. Document the last special case in the page_vma_mapped_walk() path where we don't need any more lock to call hugetlb_walk(). Taking vma lock there is not needed because either: (1) potential callers of hugetlb pvmw holds i_mmap_rwsem already (from one rmap_walk()), or (2) the caller will not walk a hugetlb vma at all so the hugetlb code path not reachable (e.g. in ksm or uprobe paths). It's slightly implicit for future page_vma_mapped_walk() callers on that lock requirement. But anyway, when one day this rule breaks, one will get a straightforward warning in hugetlb_walk() with lockdep, then there'll be a way out. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20221216155229.2043750-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 +--- fs/userfaultfd.c | 6 ++---- include/linux/hugetlb.h | 37 +++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 31 +++++++++++++------------------ mm/page_vma_mapped.c | 9 ++++++--- mm/pagewalk.c | 4 +--- 6 files changed, 60 insertions(+), 31 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fdb16246f46e..48f1a8ad2243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, { pte_t *ptep, pte; - ptep = huge_pte_offset(vma->vm_mm, addr, - huge_page_size(hstate_vma(vma))); - + ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3b1797e0448a..15a5bf765d43 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long flags, unsigned long reason) { - struct mm_struct *mm = ctx->mm; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(mm); - - ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + mmap_assert_locked(ctx->mm); + ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); if (!ptep) goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d755e2a7c0db..b6b10101bea7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include #include #include #include @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * + * IMPORTANT: we should normally not directly call this function, instead + * this is only a common interface to implement arch-specific + * walker. Please use hugetlb_walk() instead, because that will attempt to + * verify the locking for you. 
+ * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be @@ -1229,4 +1235,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif +static inline bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; +} + +/* + * Safe version of huge_pte_offset() to check the locks. See comments + * above huge_pte_offset(). + */ +static inline pte_t * +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) +{ +#if defined(CONFIG_HUGETLB_PAGE) && \ + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * If pmd sharing possible, locking needed to safely walk the + * hugetlb pgtables. More information can be found at the comment + * above huge_pte_offset() in the same file. + * + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. + */ + if (__vma_shareable_lock(vma)) + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && + !lockdep_is_held( + &vma->vm_file->f_mapping->i_mmap_rwsem)); +#endif + return huge_pte_offset(vma->vm_mm, addr, sz); +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da4c37553c08..0e5441d6890a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -260,11 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) /* * hugetlb vma_lock helper routines */ -static bool __vma_shareable_lock(struct vm_area_struct *vma) -{ - return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; -} - void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { @@ -4980,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else { /* * For shared mappings the vma lock must be held before - * calling huge_pte_offset in the src vma. Otherwise, the + * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. 
*/ @@ -4990,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); + src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; @@ -5197,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { - src_pte = huge_pte_offset(mm, old_addr, sz); + src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; @@ -5260,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; @@ -5573,7 +5568,7 @@ retry_avoidcopy: mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -5611,7 +5606,7 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -6397,7 +6392,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, return NULL; hugetlb_vma_lock_read(vma); - pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + pte = hugetlb_walk(vma, haddr, huge_page_size(h)); if (!pte) goto out_unlock; @@ -6462,8 +6457,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. 
*/ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), - huge_page_size(h)); + pte = hugetlb_walk(vma, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -6654,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address, psize); + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; @@ -7064,8 +7059,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr, - vma_mmu_pagesize(svma)); + spte = hugetlb_walk(svma, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -7377,7 +7372,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 93e13fc17d3c..4e448cfbc6ef 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); - - /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, size); + /* + * All callers that get here will already hold the + * i_mmap_rwsem. Therefore, no additional locks need to be + * taken before calling hugetlb_walk(). + */ + pvmw->pte = hugetlb_walk(vma, pvmw->address, size); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d98564a7be57..cb23f8a15c13 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -305,13 +305,11 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask, sz); - + pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); - if (err) break; } while (addr = next, addr != end); -- cgit v1.2.3-70-g09d2 From c5094ec79cbe487983e3a96548a7eb1c1c82c727 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Dec 2022 14:45:07 -0800 Subject: hugetlb: initialize variable to avoid compiler warning With the gcc 'maybe-uninitialized' warning enabled, gcc will produce: mm/hugetlb.c:6896:20: warning: `chg' may be used uninitialized This is a false positive, but may be difficult for the compiler to determine. maybe-uninitialized is disabled by default, but this gets flagged as a 0-DAY build regression. Initialize the variable to silence the warning. 
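A reduced example of this warning class (hypothetical code, not the kernel function; to_reserve() is made up): gcc cannot always prove that the assignment path runs before the use, so the initializer quiets the false positive:

    long chg = -1;  /* initialized only to silence -Wmaybe-uninitialized */
    bool has_vma = vma != NULL;

    if (has_vma)
            chg = to_reserve(inode);        /* hypothetical helper */

    /* gcc may not track that 'has_vma' is unchanged, and warn on 'chg' here */
    if (has_vma && chg < 0)
            return false;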
Link: https://lkml.kernel.org/r/20221216224507.106789-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0e5441d6890a..a82f41024167 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6760,7 +6760,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg, add = -1; + long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; -- cgit v1.2.3-70-g09d2 From a79390f5d6a78647fd70856bd42b22d994de0ba2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:06 -0500 Subject: mm/mprotect: use long for page accountings and retval Switch to use type "long" for page accountings and retval across the whole procedure of change_protection(). The change should have shrunk the possible maximum page number to half of the previous one (ULONG_MAX / 2), but it shouldn't overflow on any system either because the maximum possible pages touched by change protection should be ULONG_MAX / PAGE_SIZE. Two reasons to switch from "unsigned long" to "long": 1. It suits count_vm_numa_events() better, whose 2nd parameter takes a long type. 2. It paves the way for returning negative (error) values in the future. Currently the only caller that consumes this retval is change_prot_numa(), where the unsigned long was converted to an int. While at it, touch up the numa code to also take a long, so it'll also avoid any possible overflow during the int-size conversion. Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Kravetz Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 26 +++++++++++++------------- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b6b10101bea7..e3aa336df900 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -248,7 +248,7 @@ void hugetlb_vma_lock_release(struct kref *kref); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); @@ -437,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio, { } -static inline unsigned long hugetlb_change_protection( +static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 329ed67edd76..4ac5ea4b584c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int
mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82f41024167..a0d6d0980064 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6615,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -6624,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0, psize = huge_page_size(h); + long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d3558248a0f0..a86b8f15e2f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -631,7 +631,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; - int nr_updated; + long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 71358e45a742..0af22ab59ea8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } -static unsigned long change_pte_range(struct mmu_gather *tlb, +static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + long pages = 0; int target_node = NUMA_NO_NODE; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -353,13 +353,13 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) } \ } while (0) -static inline unsigned long change_pmd_range(struct mmu_gather *tlb, +static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; + long pages = 0; unsigned long nr_huge_updates = 0; struct mmu_notifier_range range; @@ -367,7 +367,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; + long this_pages; next = pmd_addr_end(addr, end); @@ -437,13 +437,13 @@ next: return pages; } -static inline unsigned long change_pud_range(struct mmu_gather *tlb, +static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; - unsigned long pages = 0; + long pages = 0; pud = pud_offset(p4d, addr); do { @@ -458,13 +458,13 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb, return pages; } -static inline unsigned long change_p4d_range(struct mmu_gather *tlb, +static inline long change_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; - unsigned long pages = 0; + long pages = 0; p4d = p4d_offset(pgd, addr); do { @@ -479,14 +479,14 @@ static inline unsigned long change_p4d_range(struct 
mmu_gather *tlb, return pages; } -static unsigned long change_protection_range(struct mmu_gather *tlb, +static long change_protection_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long pages = 0; + long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -505,12 +505,12 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, return pages; } -unsigned long change_protection(struct mmu_gather *tlb, +long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags) { pgprot_t newprot = vma->vm_page_prot; - unsigned long pages; + long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); -- cgit v1.2.3-70-g09d2 From d1751118c88673fe5a948ad82277898e9e284c55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:07 -0500 Subject: mm/uffd: detect pgtable allocation failures Before this patch, when there's any pgtable allocation issues happened during change_protection(), the error will be ignored from the syscall. For shmem, there will be an error dumped into the host dmesg. Two issues with that: (1) Doing a trace dump when allocation fails is not anything close to grace. (2) The user should be notified with any kind of such error, so the user can trap it and decide what to do next, either by retrying, or stop the process properly, or anything else. For userfault users, this will change the API of UFFDIO_WRITEPROTECT when pgtable allocation failure happened. It should not normally break anyone, though. If it breaks, then in good ways. One man-page update will be on the way to introduce the new -ENOMEM for UFFDIO_WRITEPROTECT. Not marking stable so we keep the old behavior on the 5.19-till-now kernels. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230104225207.1066932-4-peterx@redhat.com Signed-off-by: Peter Xu Reported-by: James Houghton Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 +- mm/hugetlb.c | 6 +++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 63 ++++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 16 +++++++---- 5 files changed, 59 insertions(+), 30 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc..3767f18114ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d6d0980064..6fe65f14d33b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6660,8 +6660,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * pre-allocations to install pte markers. 
*/ ptep = huge_pte_alloc(mm, vma, address, psize); - if (!ptep) + if (!ptep) { + pages = -ENOMEM; break; + } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { @@ -6751,7 +6753,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); - return pages << h->order; + return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a86b8f15e2f0..85a34f1f3ab8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,7 +636,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated) + if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0af22ab59ea8..92fc6f3fa512 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -330,28 +330,34 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) /* * If wr-protecting the range for file-backed, populate pgtable for the case * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed it means no memory, we don't have a better option but stop. + * failed we treat it the same way as pgtable allocation failures during + * page faults by kicking OOM and returning error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ - if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ - break; \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) + /* * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. */ #define change_prepare(vma, high, low, addr, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ - if (WARN_ON_ONCE(p == NULL)) \ - break; \ + if (p == NULL) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, @@ -367,11 +373,15 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - long this_pages; + long ret; next = pmd_addr_end(addr, end); - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } /* * Automatic NUMA balancing walks the tables with mmap_lock * held for read. It's possible a parallel update to occur @@ -401,7 +411,11 @@ static inline long change_pmd_range(struct mmu_gather *tlb, * cleared; make sure pmd populated if * necessary, then fall-through to pte level. 
*/ - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } } else { /* * change_huge_pmd() does not defer TLB flushes, @@ -422,9 +436,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(tlb, vma, pmd, addr, next, - newprot, cp_flags); - pages += this_pages; + pages += change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -443,12 +456,14 @@ static inline long change_pud_range(struct mmu_gather *tlb, { pud_t *pud; unsigned long next; - long pages = 0; + long pages = 0, ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - change_prepare(vma, pud, pmd, addr, cp_flags); + ret = change_prepare(vma, pud, pmd, addr, cp_flags); + if (ret) + return ret; if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, @@ -464,12 +479,14 @@ static inline long change_p4d_range(struct mmu_gather *tlb, { p4d_t *p4d; unsigned long next; - long pages = 0; + long pages = 0, ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - change_prepare(vma, p4d, pud, addr, cp_flags); + ret = change_prepare(vma, p4d, pud, addr, cp_flags); + if (ret) + return ret; if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, @@ -486,14 +503,18 @@ static long change_protection_range(struct mmu_gather *tlb, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - long pages = 0; + long pages = 0, ret; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); - change_prepare(vma, pgd, p4d, addr, cp_flags); + ret = change_prepare(vma, pgd, p4d, addr, cp_flags); + if (ret) { + pages = ret; + break; + } if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 65ad172add27..53c3d916ff66 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -710,11 +710,12 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } -void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, +long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; + long ret; if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; @@ -730,8 +731,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); + ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); + + return ret; } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, @@ -740,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, { struct vm_area_struct *dst_vma; unsigned long page_mask; - int err; + long err; /* * Sanitize the command parameters: @@ -779,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + + /* Return 0 on 
success, <0 on failures */ + if (err > 0) + err = 0; - err = 0; out_unlock: mmap_read_unlock(dst_mm); return err; -- cgit v1.2.3-70-g09d2 From 7d4a8be0c4b2b7ffb367929d2b352651f083806b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 10 Jan 2023 13:57:22 +1100 Subject: mm/mmu_notifier: remove unused mmu_notifier_range_update_to_read_only export mmu_notifier_range_update_to_read_only() was originally introduced in commit c6d23413f81b ("mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper") as an optimisation for device drivers that know a range has only been mapped read-only. However there are no users of this feature so remove it. As it is the only user of the struct mmu_notifier_range.vma field remove that also. Link: https://lkml.kernel.org/r/20230110025722.600912-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Mike Rapoport (IBM) Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Mike Kravetz Cc: Ira Weiny Cc: Jerome Glisse Cc: John Hubbard Cc: Ralph Campbell Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/mmu_notifier.h | 13 +++++-------- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 12 ++++++------ mm/khugepaged.c | 6 +++--- mm/ksm.c | 5 ++--- mm/madvise.c | 2 +- mm/mapping_dirty_helpers.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 4 ++-- mm/mmu_notifier.c | 10 ---------- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/oom_kill.c | 2 +- mm/rmap.c | 11 +++++------ 16 files changed, 37 insertions(+), 52 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index af1c49ae11b1..a44339a77a75 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1306,7 +1306,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e140277..64a3e051c3c4 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { - struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; @@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, - struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { - range->vma = vma; range->event = event; range->mm = mm; range->start = start; @@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, - struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end, void *owner) + struct mm_struct *mm, unsigned long start, + unsigned long end, void *owner) { - mmu_notifier_range_init(range, event, flags, vma, mm, start, end); + mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } @@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } 
-#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ +#define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) -#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ +#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c..29f36d2ae129 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7e68a36b4f7d..c13b1f67d14e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2020,7 +2020,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2282,7 +2282,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fe65f14d33b..273a6522aa4c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4966,7 +4966,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -5177,7 +5177,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mmu_notifier_range range; bool shared_pmd = false; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* @@ -5391,7 +5391,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); @@ -5597,7 +5597,7 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -6637,7 +6637,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * range if PMD sharing is possible. 
*/ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, - 0, vma, mm, start, end); + 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -7368,7 +7368,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90acfea40c13..57164c15e076 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1040,8 +1040,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, anon_vma_lock_write(vma->anon_vma); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1412,7 +1412,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v if (vma->anon_vma) lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); diff --git a/mm/ksm.c b/mm/ksm.c index dd02780c387f..cea0c4478220 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1057,8 +1057,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1164,7 +1163,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/madvise.c b/mm/madvise.c index e407d335e614..5296e78dccda 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -765,7 +765,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 175e424b9ab1..e1eb33f49059 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -191,7 +191,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end, wpwalk->tlbflush_end = start; mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, - walk->vma, walk->mm, start, end); + walk->mm, start, end); mmu_notifier_invalidate_range_start(&wpwalk->range); flush_cache_range(walk->vma, start, end); diff --git a/mm/memory.c b/mm/memory.c index 90f8f72777c7..c6bacd58d032 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1266,7 +1266,7 @@ copy_page_range(struct vm_area_struct *dst_vma, 
struct vm_area_struct *src_vma) if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, src_vma, src_mm, addr, end); + 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as @@ -1683,7 +1683,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, }; MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { @@ -1709,7 +1709,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); if (is_vm_hugetlb_page(vma)) adjust_range_if_pmd_sharing_possible(vma, &range.start, @@ -3091,7 +3091,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -3561,7 +3561,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 721b2365dbca..6c3740318a98 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -306,7 +306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * private page mappings that won't be migrated. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->vma->vm_mm, migrate->start, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); @@ -733,7 +733,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, notified = true; mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, + MMU_NOTIFY_MIGRATE, 0, migrate->vma->vm_mm, addr, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f45ff1b7626a..50c0dde1354f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1120,13 +1120,3 @@ void mmu_notifier_synchronize(void) synchronize_srcu(&srcu); } EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - -bool -mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) -{ - if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) - return false; - /* Return true if the vma still have the read flag set. 
*/ - return range->vma->vm_flags & VM_READ; -} -EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92fc6f3fa512..6ecdf0671b81 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -398,7 +398,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - vma, vma->vm_mm, addr, end); + vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index 930f65c315c0..05f90f47e149 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -498,7 +498,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_addr, len); flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..044e1eed720e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -542,7 +542,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, mm, vma->vm_start, + mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index ab74e0547a52..6ccd42bbae93 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -944,9 +944,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, vma->vm_mm, address, - vma_address_end(pvmw)); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { @@ -1475,7 +1474,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -1850,7 +1849,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -2180,7 +2179,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, swp_entry_t entry; pte_t swp_pte; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); -- cgit v1.2.3-70-g09d2 From 94688e8eb453e616098cb930e5f6fed4a6ea2dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:47 +0000 Subject: mm: remove folio_pincount_ptr() and head_compound_pincount() We can use folio->_pincount directly, since all users are guarded by tests of compound/large. 
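As an aside for readers following the conversion, here is a minimal before/after sketch of the access pattern this patch removes (illustrative only, not part of the patch; "pinned" is a hypothetical local, and the caller is assumed to have already checked folio_test_large(), as the hunks below do):

	/* Before: reach the pin count via a helper that points at page[1]. */
	pinned = atomic_read(folio_pincount_ptr(folio)) > 0;

	/* After: read the _pincount field stored in struct folio itself. */
	pinned = atomic_read(&folio->_pincount) > 0;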
Link: https://lkml.kernel.org/r/20230111142915.1001531-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 29 ++++++++++++++--------------- include/linux/mm.h | 14 ++------------ include/linux/mm_types.h | 5 ----- mm/debug.c | 4 ++-- mm/gup.c | 8 ++++---- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 9 ++++++--- 8 files changed, 32 insertions(+), 45 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index facafbdecb95..9fb0b1080d3b 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, -an exact form of pin counting is achieved, by using the 2nd struct page -in the compound page. A new struct page field, compound_pincount, has -been added in order to support this. - -This approach for compound pages avoids the counting upper limit problems that -are discussed below. Those limitations would have been aggravated severely by -huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate compound_pincount field, -page overflows were seen in some huge page stress tests. - -This also means that huge pages and compound pages do not suffer +For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +the extra space available in the struct folio is used to store the +pincount directly. + +This approach for large folios avoids the counting upper limit problems +that are discussed below. Those limitations would have been aggravated +severely by huge pages, because each tail page adds a refcount to the +head page. And in fact, testing revealed that, without a separate pincount +field, refcount overflows were seen in some huge page stress tests. + +This also means that huge pages and large folios do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +263,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting -fields, and to better report on compound pages in general. Specifically, -for compound pages, the exact (compound_pincount) pincount is reported. +dump_page() has been enhanced slightly to handle these new counting +fields, and to better report on large folios in general. Specifically, +for large folios, the exact pincount is reported. 
References ========== diff --git a/include/linux/mm.h b/include/linux/mm.h index 76c97cb8ee9a..6d3945207067 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,6 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; @@ -1641,11 +1636,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1663,7 +1653,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1674,7 +1664,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10b6eb311ede..6ff1d7db00a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -443,11 +443,6 @@ static inline atomic_t *subpages_mapcount_ptr(struct page *page) return &page[1].subpages_mapcount; } -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/debug.c b/mm/debug.c index 7f8e5f744e42..893c9dbf76ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,11 +94,11 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), head_subpages_mapcount(head), - head_compound_pincount(head)); + atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG diff --git a/mm/gup.c b/mm/gup.c index f45a3a5be53a..38ba1697dd61 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -111,7 +111,7 @@ retry: * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its compound_pincount will be incremented by @refs. + * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. @@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * try_get_folio() is left intact. 
*/ if (folio_test_large(folio)) - atomic_add(refs, folio_pincount_ptr(folio)); + atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -182,7 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (flags & FOLL_PIN) { node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) - atomic_sub(refs, folio_pincount_ptr(folio)); + atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } @@ -232,7 +232,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); - atomic_add(1, folio_pincount_ptr(folio)); + atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13b1f67d14e..9570f03cdee4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2477,9 +2477,9 @@ static void __split_huge_page_tail(struct page *head, int tail, * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. * - * What of 32-bit systems, on which head[1].compound_pincount overlays + * What of 32-bit systems, on which folio->_pincount overlays * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + * pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 273a6522aa4c..15b2707c1600 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,7 +1476,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, atomic_set(folio_mapcount_ptr(folio), 0); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1998,7 +1998,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, } atomic_set(folio_mapcount_ptr(folio), -1); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); return true; out_error: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5514d84cc712..b224c2132ed1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -775,11 +775,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -1291,6 +1293,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1314,8 +1317,8 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero subpages_mapcount"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; -- cgit 
v1.2.3-70-g09d2 From eec20426d48bd7b63c69969a793943ed1a99b731 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:48 +0000 Subject: mm: convert head_subpages_mapcount() into folio_nr_pages_mapped() Calling this 'mapcount' is confusing since mapcount is usually the number of times something is mapped; instead this is the number of mapped pages. It's also better to enforce that this is a folio rather than a head page. Move folio_nr_pages_mapped() into mm/internal.h since this is not something we want device drivers or filesystems poking at. Get rid of folio_subpages_mapcount_ptr() and use folio->_nr_pages_mapped directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++-------------------- include/linux/mm_types.h | 12 +++--------- mm/debug.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/internal.h | 18 ++++++++++++++++++ mm/rmap.c | 9 +++++---- 6 files changed, 32 insertions(+), 37 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d3945207067..2bdd08a5b8b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -843,24 +843,6 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED 0x800000 -#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ - return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -920,9 +902,9 @@ static inline bool folio_large_is_mapped(struct folio *folio) { /* * Reading folio_mapcount_ptr() below could be omitted if hugetlb - * participated in incrementing subpages_mapcount when compound mapped. + * participated in incrementing nr_pages_mapped when compound mapped. */ - return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + return atomic_read(&folio->_nr_pages_mapped) > 0 || atomic_read(folio_mapcount_ptr(folio)) >= 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6ff1d7db00a7..4751c67b98a6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -307,7 +307,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_subpages_mapcount: Do not use directly, call folio_mapcount(). + * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. 
@@ -361,7 +361,7 @@ struct folio { unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _compound_mapcount; - atomic_t _subpages_mapcount; + atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -404,7 +404,7 @@ FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _compound_mapcount); -FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); +FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -427,12 +427,6 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } -static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->subpages_mapcount; -} - static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; diff --git a/mm/debug.c b/mm/debug.c index 893c9dbf76ca..8e58e8dab0b2 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,10 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), - head_subpages_mapcount(head), + folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 15b2707c1600..c9702224931c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1475,7 +1475,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(folio_mapcount_ptr(folio), 0); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -1997,7 +1997,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, set_compound_head(p, &folio->page); } atomic_set(folio_mapcount_ptr(folio), -1); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; diff --git a/mm/internal.h b/mm/internal.h index 1d6f4e168510..583e15357e09 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. 
+ */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 6ccd42bbae93..b573472b4ac3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,12 +1080,13 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, int total_compound_mapcount(struct page *head) { + struct folio *folio = (struct folio *)head; int mapcount = head_compound_mapcount(head); int nr_subpages; int i; /* In the common case, avoid the loop when no subpages mapped by PTE */ - if (head_subpages_mapcount(head) == 0) + if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* * Add all the PTE mappings of those subpages mapped by PTE. @@ -1233,7 +1234,7 @@ void page_add_anon_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1337,7 +1338,7 @@ void page_add_file_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1399,7 +1400,7 @@ void page_remove_rmap(struct page *page, nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; -- cgit v1.2.3-70-g09d2 From 46f2722825983a51e849eb0ef2814e5c7f040fef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:59 +0000 Subject: hugetlb: remove uses of folio_mapcount_ptr Use the entire_mapcount field directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9702224931c..a68e0e597a8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1474,7 +1474,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, int nr_pages = 1 << order; struct page *p; - atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -1996,7 +1996,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, if (i != 0) set_compound_head(p, &folio->page); } - atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; -- cgit v1.2.3-70-g09d2 From 2d678c641a4625d2b1cfeb50d7426fab6d3740b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:07 +0000 Subject: hugetlb: remove uses of compound_dtor and compound_nr Convert the entire file to use the folio equivalents. 
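For illustration only (not part of the patch), the conversion pattern mirrors the PageHuge() hunk below: the destructor test moves from a tail-page field to the folio:

	/* Before: peek at the first tail page of the compound page. */
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;

	/* After: look the same information up through the folio. */
	folio = page_folio(page);
	return folio->_folio_dtor == HUGETLB_PAGE_DTOR;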
Link: https://lkml.kernel.org/r/20230111142915.1001531-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a68e0e597a8f..ca9e177b9c54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2038,11 +2038,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, */ int PageHuge(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return 0; - - page = compound_head(page); - return page[1].compound_dtor == HUGETLB_PAGE_DTOR; + folio = page_folio(page); + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -2052,10 +2053,11 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - if (!PageHead(page_head)) + struct folio *folio = (struct folio *)page_head; + if (!folio_test_large(folio)) return 0; - return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHeadHuge); -- cgit v1.2.3-70-g09d2 From 2ff6cecee669bf0fc63eadebac8cfc81f74b9a4c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:03 -0600 Subject: mm/memory-failure: convert hugetlb_clear_page_hwpoison to folios Change hugetlb_clear_page_hwpoison() to folio_clear_hugetlb_hwpoison() by changing the function to take in a folio. This converts one use of ClearPageHWPoison and HPageRawHwpUnreliable to their folio equivalents. Link: https://lkml.kernel.org/r/20230112204608.80136-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 2 +- mm/memory-failure.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d6413c3b8f5..cf60fe741c1d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -878,9 +878,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE -extern void hugetlb_clear_page_hwpoison(struct page *hpage); +extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else -static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ca9e177b9c54..291ad4cb02f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1731,7 +1731,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * which makes any healthy subpages reusable. 
*/ if (unlikely(folio_test_hwpoison(folio))) - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); for (i = 0; i < pages_per_huge_page(h); i++) { subpage = folio_page(folio, i); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19f6035608d5..d4aaed2756af 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1785,12 +1785,12 @@ static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) return __free_raw_hwp_pages(hpage, move_flag); } -void hugetlb_clear_page_hwpoison(struct page *hpage) +void folio_clear_hugetlb_hwpoison(struct folio *folio) { - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - ClearPageHWPoison(hpage); - free_raw_hwp_pages(hpage, true); + folio_clear_hwpoison(folio); + free_raw_hwp_pages(&folio->page, true); } /* @@ -1889,7 +1889,7 @@ retry: folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); if (migratable_cleared) folio_set_hugetlb_migratable(folio); folio_unlock(folio); -- cgit v1.2.3-70-g09d2 From 04bac040bc71b4b37550eed5854f34ca161756f9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 18 Jan 2023 09:40:39 -0800 Subject: mm/hugetlb: convert get_hwpoison_huge_page() to folios Straightforward conversion of get_hwpoison_huge_page() to get_hwpoison_hugetlb_folio(). Reduces two references to a head page in memory-failure.c [arnd@arndb.de: fix get_hwpoison_hugetlb_folio() stub] Link: https://lkml.kernel.org/r/20230119111920.635260-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230118174039.14247-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Arnd Bergmann Acked-by: Naoya Horiguchi Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 10 +++++----- mm/memory-failure.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cf60fe741c1d..a51e6daacac6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -172,7 +172,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); @@ -418,7 +418,7 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 291ad4cb02f9..0f9df0143772 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7268,18 +7268,18 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); - if (PageHeadHuge(page)) { + if (folio_test_hugetlb(folio)) { *hugetlb = true; - if (HPageFreed(page)) + 
if (folio_test_hugetlb_freed(folio)) ret = 0; - else if (HPageMigratable(page) || unpoison) - ret = get_page_unless_zero(page); + else if (folio_test_hugetlb_migratable(folio) || unpoison) + ret = folio_try_get(folio); else ret = -EBUSY; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 38f84bff8bdf..0a382191737f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1257,28 +1257,28 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) static int __get_hwpoison_page(struct page *page, unsigned long flags) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, false); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); if (hugetlb) return ret; /* - * This check prevents from calling get_page_unless_zero() for any - * unsupported type of page in order to reduce the risk of unexpected - * races caused by taking a page refcount. + * This check prevents from calling folio_try_get() for any + * unsupported type of folio in order to reduce the risk of unexpected + * races caused by taking a folio refcount. */ - if (!HWPoisonHandlable(head, flags)) + if (!HWPoisonHandlable(&folio->page, flags)) return -EBUSY; - if (get_page_unless_zero(head)) { - if (head == compound_head(page)) + if (folio_try_get(folio)) { + if (folio == page_folio(page)) return 1; pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); - put_page(head); + folio_put(folio); } return 0; @@ -1347,11 +1347,11 @@ out: static int __get_unpoison_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, true); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); if (hugetlb) return ret; -- cgit v1.2.3-70-g09d2 From e430a95a04efc557bc4ff9b3035c7c85aee5d63f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:48 -0800 Subject: mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK To simplify the usage of VM_LOCKED_CLEAR_MASK in vm_flags_clear(), replace it with VM_LOCKED_MASK bitmask and convert all users. Link: https://lkml.kernel.org/r/20230126193752.297968-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Davidlohr Bueso Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. 
McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- kernel/fork.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 6 +++--- mm/mmap.c | 6 +++--- mm/mremap.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2df5d122b67..663726ca2240 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR diff --git a/kernel/fork.c b/kernel/fork.c index 9260f975b8f4..5e3029ea8e1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f9df0143772..ab35b1cc9927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6969,8 +6969,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 0336f52e03d7..5c4fff93cd6b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (vma->vm_start != tmp) return -ENOMEM; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ tmp = vma->vm_end; @@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags) struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; @@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags) for_each_vma(vmi, vma) { vm_flags_t newflags; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ diff --git a/mm/mmap.c b/mm/mmap.c index b698a96d0511..03d7c37c5969 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2668,7 +2668,7 @@ expanded: if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -3338,8 +3338,8 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_init(vma, (vm_flags | mm->def_flags | + VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index 5c9a57909862..d70d8063c6e2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -687,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page -- cgit v1.2.3-70-g09d2 From 6aa3a920125e9f58891e2b5dc2efd4d0c1ff05a6 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:50 -0600 Subject: mm/hugetlb: convert isolate_hugetlb to folios Patch series "continue hugetlb folio conversion", v3. This series continues the conversion of core hugetlb functions to use folios. This series converts many helper functions in the hugetlb fault path. This is in preparation for another series to convert the hugetlb fault code paths to operate on folios. This patch (of 8): Convert isolate_hugetlb() to take in a folio and convert its callers to pass a folio. Using page_folio() to convert the callers to use a folio is safe, as isolate_hugetlb() operates on a head page.
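A minimal caller-side sketch of the change (illustrative only; "list" stands in for whatever list_head the real call sites pass):

	/* Before: isolate_hugetlb() took a head page. */
	ret = isolate_hugetlb(page, list);

	/* After: callers hand over the folio; page_folio() is safe here
	 * because the old interface already required the head page.
	 */
	ret = isolate_hugetlb(page_folio(page), list);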
Link: https://lkml.kernel.org/r/20230113223057.173292-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230113223057.173292-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 16 ++++++++-------- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a51e6daacac6..6e38a019f654 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); +int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,7 +413,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } diff --git a/mm/gup.c b/mm/gup.c index 25e4a3d923d6..b0885f70579c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab35b1cc9927..0c1e1ce113c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,7 +2925,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3000,7 +3000,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7250,19 +7250,19 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. 
*/ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b4b30d9b0782..db85c2d37f70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f176..a1e8c3e9ab08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dd5ca942256f..fc034b070645 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(page_folio(page), qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 206fcdbe67f3..f6464bce7678 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } -- cgit v1.2.3-70-g09d2 From 6f6956cf7e6a3034f61780446547e849aa4e216d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:51 -0600 Subject: mm/hugetlb: convert __update_and_free_page() to folios Change __update_and_free_page() to __update_and_free_hugetlb_folio() by changing its callers to pass in a folio. 
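A minimal sketch of the caller conversion, mirroring the free_hpage_workfn() hunk below (shown only to illustrate the pattern):

	/* Before: the helper took a page and derived the folio internally. */
	__update_and_free_page(h, page);

	/* After: the caller converts once and the helper works on the folio. */
	__update_and_free_hugetlb_folio(h, page_folio(page));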
Link: https://lkml.kernel.org/r/20230113223057.173292-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c1e1ce113c8..d27fcf768548 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1698,10 +1698,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, enqueue_hugetlb_folio(h, folio); } -static void __update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_hugetlb_folio(struct hstate *h, + struct folio *folio) { int i; - struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1714,7 +1714,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, page)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1750,7 +1750,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(page, huge_page_order(h)); + __free_pages(&folio->page, huge_page_order(h)); } } @@ -1790,7 +1790,7 @@ static void free_hpage_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - __update_and_free_page(h, page); + __update_and_free_hugetlb_folio(h, page_folio(page)); cond_resched(); } @@ -1807,7 +1807,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { - __update_and_free_page(h, &folio->page); + __update_and_free_hugetlb_folio(h, folio); return; } -- cgit v1.2.3-70-g09d2 From a36f1e9024740c3820427afca4cd375e32a1bb15 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:52 -0600 Subject: mm/hugetlb: convert dequeue_hugetlb_page functions to folios dequeue_huge_page_node_exact() is changed to dequeue_hugetlb_folio_node_exact() and dequeue_huge_page_nodemask() is changed to dequeue_hugetlb_folio_nodemask(). Update their callers to pass in a folio.
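A minimal sketch of what changes for callers (illustrative only): the dequeue helpers now hand back a folio, and boundaries that must still return a struct page convert at the edge, as alloc_huge_page_nodemask() does in the hunk below:

	/* Before: a struct page came straight off the free list. */
	page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
	return page;

	/* After: a folio comes out; convert only where a page is required. */
	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask);
	return &folio->page;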
Link: https://lkml.kernel.org/r/20230113223057.173292-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d27fcf768548..3e648fccf33e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1282,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, + int nid) { - struct page *page; + struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_longterm_pinnable_page(page)) + list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { + if (pin && !folio_is_longterm_pinnable(folio)) continue; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) continue; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - ClearHPageFreed(page); + list_move(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - return page; + return folio; } return NULL; } -static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) +static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; @@ -1320,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { - struct page *page; + struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; @@ -1332,9 +1333,9 @@ retry_cpuset: continue; node = zone_to_nid(zone); - page = dequeue_huge_page_node_exact(h, node); - if (page) - return page; + folio = dequeue_hugetlb_folio_node_exact(h, node); + if (folio) + return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; @@ -1352,7 +1353,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1374,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (!folio) + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); - if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetHPageRestoreReserve(page); + if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); - return page; + return 
&folio->page; err: return NULL; @@ -2475,12 +2478,13 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { - struct page *page; + struct folio *folio; - page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); - if (page) { + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + preferred_nid, nmask); + if (folio) { spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } } spin_unlock_irq(&hugetlb_lock); -- cgit v1.2.3-70-g09d2 From 3a740e8bb56ef7ee6b9098b694caabab843be067 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:53 -0600 Subject: mm/hugetlb: convert alloc_surplus_huge_page() to folios Change alloc_surplus_huge_page() to alloc_surplus_hugetlb_folio() and update its callers. Link: https://lkml.kernel.org/r/20230113223057.173292-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e648fccf33e..fa61b4aa68ca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2378,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; @@ -2416,7 +2416,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, out_unlock: spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, @@ -2449,7 +2449,7 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; @@ -2460,16 +2460,16 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + if (!folio) + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* page migration callback function */ @@ -2518,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); + struct folio *folio; struct page *page, *tmp; int ret; long i; @@ -2537,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); - if (!page) { + if (!folio) { alloc_ok = false; break; } - list_add(&page->lru, &surplus_list); + list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; @@ -3496,7 +3497,7 @@ static 
int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -3539,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. -- cgit v1.2.3-70-g09d2 From ff7d853b031302376a0d3640fa1c463d94079637 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:54 -0600 Subject: mm/hugetlb: increase use of folios in alloc_huge_page() Change hugetlb_cgroup_commit_charge{,_rsvd}(), dequeue_huge_page_vma() and alloc_buddy_huge_page_with_mpol() to use folios so alloc_huge_page() is cleaned by operating on folios until its return. Link: https://lkml.kernel.org/r/20230113223057.173292-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 33 ++++++++++++++++----------------- mm/hugetlb_cgroup.c | 8 ++------ 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a8063..3d82d91f49ac 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fa61b4aa68ca..5d0d1efbe590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1348,7 +1348,7 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -static struct page *dequeue_huge_page_vma(struct hstate *h, +static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct 
vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) @@ -1392,7 +1392,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } mpol_cond_put(mpol); - return &folio->page; + return folio; err: return NULL; @@ -2446,7 +2446,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; @@ -2469,7 +2469,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* page migration callback function */ @@ -3018,7 +3018,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); - struct page *page; struct folio *folio; long map_chg, map_commit; long gbl_chg; @@ -3082,34 +3081,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { + folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!folio) { spin_unlock_irq(&hugetlb_lock); - page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } - list_add(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); /* Fall through */ } - folio = page_folio(page); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
*/ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, page); + h_cg, folio); } spin_unlock_irq(&hugetlb_lock); - hugetlb_set_page_subpool(page, spool); + hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3130,7 +3129,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return page; + return &folio->page; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d9e4425d81ac..dedd2edb076e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } -- cgit v1.2.3-70-g09d2 From e37d3e838d9078538f920957d1e89682b6764977 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:55 -0600 Subject: mm/hugetlb: convert alloc_migrate_huge_page to folios Change alloc_huge_page_nodemask() to alloc_hugetlb_folio_nodemask() and alloc_migrate_huge_page() to alloc_migrate_hugetlb_folio(). Both functions now return a folio rather than a page. 
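A rough sketch of why the folio return type is the cleaner interface, again as a userspace toy with made-up names rather than kernel code: the allocator reports success or failure through the folio pointer, and only a legacy caller at the boundary converts back to a page.

/*
 * Toy model (not kernel code): the allocator reports success through a
 * folio pointer; only the legacy wrapper converts back to a page.
 */
#include <stdlib.h>
#include <stdio.h>

struct toy_page {
        unsigned long flags;
};

struct toy_folio {
        struct toy_page page;   /* head page first, so same address */
        int temporary;
};

/* Fallback allocation path: a folio on success, NULL on failure. */
static struct toy_folio *alloc_migrate_folio(void)
{
        struct toy_folio *folio = calloc(1, sizeof(*folio));

        if (!folio)
                return NULL;
        folio->temporary = 1;   /* stand-in for folio_set_hugetlb_temporary() */
        return folio;
}

/* Legacy boundary: older callers still want a struct page. */
static struct toy_page *alloc_migrate_page(void)
{
        struct toy_folio *folio = alloc_migrate_folio();

        return folio ? &folio->page : NULL;
}

int main(void)
{
        struct toy_page *page = alloc_migrate_page();

        printf("allocation %s\n", page ? "succeeded" : "failed");
        free(page);             /* same address as the toy folio */
        return 0;
}

The in-tree transitional callers often write the bridge as a bare return &folio->page; because the head page sits at offset zero of struct folio this yields a NULL page pointer for a NULL folio in practice, though the explicit check above is the more defensive spelling.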
Link: https://lkml.kernel.org/r/20230113223057.173292-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 18 +++++++++--------- mm/migrate.c | 5 ++++- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e38a019f654..2375c62c61a4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -719,7 +719,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); @@ -1040,8 +1040,8 @@ static inline struct page *alloc_huge_page(struct vm_area_struct *vma, return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d0d1efbe590..57894beb3382 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2419,7 +2419,7 @@ out_unlock: return folio; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; @@ -2439,7 +2439,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, */ folio_set_hugetlb_temporary(folio); - return &folio->page; + return folio; } /* @@ -2472,8 +2472,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, return folio; } -/* page migration callback function */ -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +/* folio migration callback function */ +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); @@ -2484,12 +2484,12 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } } spin_unlock_irq(&hugetlb_lock); - return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* mempolicy aware migration callback */ @@ -2498,16 +2498,16 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, { struct mempolicy *mpol; nodemask_t *nodemask; - struct page *page; + struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* diff --git a/mm/migrate.c b/mm/migrate.c index f6464bce7678..811e76c6fac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1663,6 +1663,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long 
private) struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; + struct folio *hugetlb_folio = NULL; struct folio *new_folio = NULL; int nid; int zidx; @@ -1677,7 +1678,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + mtc->nmask, gfp_mask); + return &hugetlb_folio->page; } if (folio_test_large(folio)) { -- cgit v1.2.3-70-g09d2 From 0ffdc38eb564c1c71a58bbaf874945ba54293ff9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:56 -0600 Subject: mm/hugetlb: convert restore_reserve_on_error() to folios Use the hugetlb folio flag macros inside restore_reserve_on_error() and update the comments to reflect the use of folios. Link: https://lkml.kernel.org/r/20230113223057.173292-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57894beb3382..3120c3db60c4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2819,22 +2819,23 @@ static long vma_del_reservation(struct hstate *h, void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { + struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); - if (HPageRestoreReserve(page)) { + if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map - * manipulation. Clear HPageRestoreReserve so that - * global reserve count will not be incremented + * manipulation. Clear hugetlb_restore_reserve so + * that global reserve count will not be incremented * by free_huge_page. This will make it appear - * as though the reservation for this page was + * as though the reservation for this folio was * consumed. This may prevent the task from - * faulting in the page at a later time. This + * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else @@ -2845,7 +2846,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * This indicates there is an entry in the reserve map * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise - * HPageRestoreReserve would be set on the page. + * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ @@ -2854,12 +2855,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * VERY rare out of memory condition. Since * we can not delete the entry, set - * HPageRestoreReserve so that the reserve - * count will be incremented when the page + * hugetlb_restore_reserve so that the reserve + * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. 
*/ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from @@ -2875,12 +2876,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * For private mappings, no entry indicates * a reservation is present. Since we can - * not add an entry, set SetHPageRestoreReserve - * on the page so reserve count will be + * not add an entry, set hugetlb_restore_reserve + * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing -- cgit v1.2.3-70-g09d2 From bdd7be075acb650cc57d8ee752b5375b966ad07e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:57 -0600 Subject: mm/hugetlb: convert demote_free_huge_page to folios Change demote_free_huge_page to demote_free_hugetlb_folio() and change demote_pool_huge_page() pass in a folio. Link: https://lkml.kernel.org/r/20230113223057.173292-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3120c3db60c4..4ecdbad9a451 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3579,12 +3579,12 @@ out: return 0; } -static int demote_free_huge_page(struct hstate *h, struct page *page) +static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i, nid = page_to_nid(page); + int i, nid = folio_nid(folio); struct hstate *target_hstate; - struct folio *folio = page_folio(page); struct page *subpage; + struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3592,18 +3592,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, page); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (rc) { - /* Allocation of vmemmmap failed, we can not demote page */ + /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - set_page_refcounted(page); - add_hugetlb_folio(h, page_folio(page), false); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); return rc; } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count pages. + * sizes as it will not ref count folios. 
*/ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); @@ -3618,15 +3618,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { - subpage = nth_page(page, i); - folio = page_folio(subpage); + subpage = folio_page(folio, i); + inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(folio, + prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); - set_page_private(subpage, 0); - prep_new_hugetlb_folio(target_hstate, folio, nid); + folio_change_private(inner_folio, NULL); + prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3648,7 +3648,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct page *page; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); @@ -3659,11 +3659,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(page, &h->hugepage_freelists[node], lru) { - if (PageHWPoison(page)) + list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) continue; - - return demote_free_huge_page(h, page); + return demote_free_hugetlb_folio(h, folio); } } -- cgit v1.2.3-70-g09d2 From ea4c353df37750d170dc0dcbfa8c47c984779733 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:30 -0800 Subject: mm/hugetlb: convert hugetlb_install_page to folios Patch series "convert hugetlb fault functions to folios", v2. This series converts the hugetlb page faulting functions to operate on folios. These include hugetlb_no_page(), hugetlb_wp(), copy_hugetlb_page_range(), and hugetlb_mcopy_atomic_pte(). This patch (of 8): Change hugetlb_install_page() to hugetlb_install_folio(). This reduces one user of the Huge Page flag macros which take in a page. 
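The "Huge Page flag macros which take in a page" and their folio counterparts are two views of the same head-page state, which a tiny toy model can make concrete. The names below (toy_page, toy_folio, TOY_FLAG_MIGRATABLE) are invented for illustration and do not reproduce the kernel's actual flag storage layout.

/*
 * Toy model (not kernel code, not the real flag layout): the page-based
 * setter and the folio helper manipulate the same head-page state.
 */
#include <stdio.h>

#define TOY_FLAG_MIGRATABLE     (1UL << 0)

struct toy_page {
        unsigned long private;          /* toy flag storage */
};

struct toy_folio {
        struct toy_page page;           /* head page first */
};

/* Old style: a page-based setter. */
static void set_page_migratable(struct toy_page *head)
{
        head->private |= TOY_FLAG_MIGRATABLE;
}

/* New style: the folio helper just names the head page explicitly. */
static void folio_set_migratable(struct toy_folio *folio)
{
        set_page_migratable(&folio->page);
}

int main(void)
{
        struct toy_folio folio = { .page = { .private = 0 } };

        folio_set_migratable(&folio);
        printf("flag visible through the page view: %lu\n",
               folio.page.private & TOY_FLAG_MIGRATABLE);
        return 0;
}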
Link: https://lkml.kernel.org/r/20230125170537.96973-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230125170537.96973-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4ecdbad9a451..b246f2b4d0bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4946,14 +4946,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) } static void -hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct page *new_page) +hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct folio *new_folio) { - __SetPageUptodate(new_page); - hugepage_add_new_anon_rmap(new_page, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + __folio_mark_uptodate(new_folio); + hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - SetHPageMigratable(new_page); + folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5107,7 +5107,7 @@ again: /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_page(dst_vma, dst_pte, addr, new); + hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; -- cgit v1.2.3-70-g09d2 From 91a2fb956ad993f3cbcfc632611e17e3699fb652 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:31 -0800 Subject: mm/hugetlb: convert hugetlbfs_pagecache_present() to folios Refactor hugetlbfs_pagecache_present() to avoid getting and dropping a refcount on a page. Use RCU and page_cache_next_miss() instead. 
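The point of the refactor is that a caller who only needs a yes/no answer should not have to take and drop a reference just to ask it. Below is a minimal userspace sketch of the presence-test idea, with a plain array standing in for the mapping and no RCU or locking; all names are invented.

/*
 * Toy model (not kernel code): a presence test that never takes a
 * reference.  A fixed array stands in for the mapping.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_SLOTS 8

static void *toy_mapping[TOY_SLOTS];

/* Return the first index >= idx with no entry (cf. page_cache_next_miss). */
static unsigned long toy_next_miss(unsigned long idx)
{
        while (idx < TOY_SLOTS && toy_mapping[idx])
                idx++;
        return idx;
}

/* Present at idx exactly when the next miss is not idx itself. */
static bool toy_present(unsigned long idx)
{
        return toy_next_miss(idx) != idx;
}

int main(void)
{
        int dummy;

        toy_mapping[3] = &dummy;
        printf("idx 3 present: %d, idx 4 present: %d\n",
               toy_present(3), toy_present(4));
        return 0;
}

The real code wraps the probe in rcu_read_lock()/rcu_read_unlock() so the cache can be walked safely without pinning anything.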
Link: https://lkml.kernel.org/r/20230125170537.96973-3-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b246f2b4d0bd..a0d486ed5411 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5651,17 +5651,15 @@ out_release_old: static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { - struct address_space *mapping; - pgoff_t idx; - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, vma, address); + bool present; - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + rcu_read_lock(); + present = page_cache_next_miss(mapping, idx, 1) != idx; + rcu_read_unlock(); - page = find_get_page(mapping, idx); - if (page) - put_page(page); - return page != NULL; + return present; } int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, -- cgit v1.2.3-70-g09d2 From ea8e72f4116a995c2aba3fb738ac372c4115375a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:32 -0800 Subject: mm/hugetlb: convert putback_active_hugepage to take in a folio Convert putback_active_hugepage() to folio_putback_active_hugetlb(), this removes one user of the Huge Page macros which take in a page. The callers in migrate.c are also cleaned up by being able to directly use the src and dst folio variables. Link: https://lkml.kernel.org/r/20230125170537.96973-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 8 ++++---- mm/migrate.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2375c62c61a4..067906c5778e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -429,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d486ed5411..fd1ce61b8f3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7300,13 +7300,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void putback_active_hugepage(struct page *page) +void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); - SetHPageMigratable(page); - list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + folio_set_hugetlb_migratable(folio); + 
list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - put_page(page); + folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) diff --git a/mm/migrate.c b/mm/migrate.c index 811e76c6fac1..c09872cf41b7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -151,7 +151,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { if (unlikely(PageHuge(page))) { - putback_active_hugepage(page); + folio_putback_active_hugetlb(page_folio(page)); continue; } list_del(&page->lru); @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1383,7 +1383,7 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1395,7 +1395,7 @@ out: if (put_new_page) put_new_page(new_hpage, private); else - putback_active_hugepage(new_hpage); + folio_putback_active_hugetlb(dst); return rc; } -- cgit v1.2.3-70-g09d2 From d0ce0e47b323a8d7fb5dc3314ce56afa650ade2d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:33 -0800 Subject: mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio() Change alloc_huge_page() to alloc_hugetlb_folio() by changing all callers to handle the now folio return type of the function. In this conversion, alloc_huge_page_vma() is also changed to alloc_hugetlb_folio_vma() and hugepage_add_new_anon_rmap() is changed to take in a folio directly. Many additions of '&folio->page' are cleaned up in subsequent patches. hugetlbfs_fallocate() is also refactored to use the RCU + page_cache_next_miss() API. Link: https://lkml.kernel.org/r/20230125170537.96973-5-sidhartha.kumar@oracle.com Suggested-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 40 +++++----- include/linux/hugetlb.h | 8 +- include/linux/rmap.h | 2 +- mm/hugetlb.c | 201 ++++++++++++++++++++++++------------------------ mm/mempolicy.c | 6 +- mm/rmap.c | 6 +- 6 files changed, 133 insertions(+), 130 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 44ecdcb796cc..f89e12106e8a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. 
+ * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 067906c5778e..6408f85e5754 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -717,11 +717,11 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -1033,7 +1033,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -1047,7 +1047,7 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a6bd1f0a183d..a4570da03e58 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void 
hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd1ce61b8f3f..ea8d4611779b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2493,7 +2493,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; @@ -2507,7 +2507,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* @@ -2798,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the page consumed the reservation. * HPageRestoreReserve is set in the page. * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2814,7 +2814,7 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) @@ -2844,8 +2844,8 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. 
@@ -3014,7 +3014,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); @@ -3023,7 +3023,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3130,7 +3130,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return &folio->page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add struct folio *new_folio) { __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + hugepage_add_new_anon_rmap(new_folio, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); @@ -5080,34 +5080,34 @@ again: } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + &new_folio->page); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5478,7 +5478,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5539,9 +5540,9 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. 
*/ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5586,7 +5587,7 @@ retry_avoidcopy: return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5599,9 +5600,9 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); @@ -5618,12 +5619,12 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5632,9 +5633,9 @@ out_release_all: * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, &new_folio->page); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5753,11 +5754,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5776,9 +5777,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5811,8 +5812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5826,17 +5827,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. 
*/ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5845,13 +5846,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, &folio->page); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5864,7 +5865,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5872,8 +5873,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5907,10 +5908,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5923,20 +5924,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. 
*/ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5945,11 +5946,11 @@ out: backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, &folio->page); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -6173,16 +6174,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6193,34 +6194,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6236,25 +6237,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6269,16 +6270,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. 
*/ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6290,10 +6291,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6304,7 +6305,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6326,20 +6327,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6871,7 +6872,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. 
*/ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc034b070645..7686f40c9750 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1210,9 +1210,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; diff --git a/mm/rmap.c b/mm/rmap.c index 86fccc2b9fc9..8287f2cc327d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2534,15 +2534,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3-70-g09d2 From d2d7bb44bfbd29200426ba17741550d36e081f91 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:34 -0800 Subject: mm/hugetlb: convert restore_reserve_on_error to take in a folio Every caller of restore_reserve_on_error() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f89e12106e8a..c736947e73da 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -873,7 +873,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6408f85e5754..20ceaaea1697 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -726,7 +726,7 @@ struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *v int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8d4611779b..1f6270c586c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2802,9 +2802,9 @@ static long vma_del_reservation(struct hstate *h, * and the hugetlb mutex should remain held when calling this routine. 
* * It handles two specific cases: - * 1) A reservation was in place and the page consumed the reservation. - * HPageRestoreReserve is set in the page. - * 2) No reservation was in place for the page, so HPageRestoreReserve is + * 1) A reservation was in place and the folio consumed the reservation. + * hugetlb_restore_reserve is set in the folio. + * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the @@ -2817,9 +2817,8 @@ static long vma_del_reservation(struct hstate *h, * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page) + unsigned long address, struct folio *folio) { - struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { @@ -5102,7 +5101,7 @@ again: entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - &new_folio->page); + new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; @@ -5634,7 +5633,7 @@ out_release_all: * unshare) */ if (new_folio != page_folio(old_page)) - restore_reserve_on_error(h, vma, haddr, &new_folio->page); + restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: put_page(old_page); @@ -5846,7 +5845,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); goto out; } @@ -5947,7 +5946,7 @@ backout: spin_unlock(ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_unlock(folio); folio_put(folio); @@ -6210,7 +6209,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied @@ -6339,7 +6338,7 @@ out_release_unlock: folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } -- cgit v1.2.3-70-g09d2 From 9b91c0e277a3dbb165c2e4301be7a231dc2f76f7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:35 -0800 Subject: mm/hugetlb: convert hugetlb_add_to_page_cache to take in a folio Every caller of hugetlb_add_to_page_cache() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. 
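To see the combined effect of this and the previous patch on a caller, here is a condensed sketch modeled on the hugetlbfs_fallocate() hunks above. The function name is made up for illustration; the hugetlb fault mutex, the migratable flag, and the fallocate bookkeeping are omitted, and h, vma, mapping, index and addr stand for the values that function already computes:

#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Illustrative only: condensed from the hugetlbfs_fallocate() path above. */
static int hugetlb_fallocate_one_index(struct hstate *h, struct vm_area_struct *vma,
				       struct address_space *mapping,
				       pgoff_t index, unsigned long addr)
{
	struct folio *folio;
	int err;

	/* Reserve-aware allocation; may consume a reservation for addr. */
	folio = alloc_hugetlb_folio(vma, addr, 0);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
	__folio_mark_uptodate(folio);

	/* Both helpers now take the folio itself; no &folio->page round trip. */
	err = hugetlb_add_to_page_cache(folio, mapping, index);
	if (err) {
		/* Undo the reserve map update made by alloc_hugetlb_folio(). */
		restore_reserve_on_error(h, vma, addr, folio);
		folio_put(folio);
		return err;
	}

	/*
	 * hugetlb_add_to_page_cache() leaves the folio locked on success and
	 * the page cache now holds its own reference, so the allocation
	 * reference and the lock can be dropped here, as hugetlbfs_fallocate()
	 * does once it is finished with the folio.
	 */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}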
Link: https://lkml.kernel.org/r/20230125170537.96973-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c736947e73da..cfd09f95551b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -871,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); - error = hugetlb_add_to_page_cache(&folio->page, mapping, index); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20ceaaea1697..df6dd624ccfe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -723,7 +723,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1f6270c586c0..de1f73e5e200 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5662,10 +5662,9 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return present; } -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; @@ -5677,7 +5676,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, __folio_clear_locked(folio); return err; } - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file @@ -5836,7 +5835,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -6269,7 +6268,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; -- cgit v1.2.3-70-g09d2 From 371607a3c793d7183b0faecc1fb4aa88fadcf202 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:36 -0800 Subject: mm/hugetlb: convert hugetlb_wp() to take in a folio Change the pagecache_page argument of hugetlb_wp to pagecache_folio. Replaces a call to find_lock_page() with filemap_lock_folio(). 
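For illustration only, the folio-typed comparison that replaces the old raw "page != pagecache_page" check in hugetlb_fault() and hugetlb_wp() can be read as the hypothetical helper below (the name is not part of the patch):

#include <linux/mm.h>

static bool is_pagecache_folio(struct page *page, struct folio *pagecache_folio)
{
	/*
	 * page_folio() resolves any page, head or tail, to its containing
	 * folio; the comparison also behaves correctly when pagecache_folio
	 * is NULL because no page cache entry was found.
	 */
	return page_folio(page) == pagecache_folio;
}

Correspondingly, the find_lock_page()/unlock_page()/put_page() triple in hugetlb_fault() becomes filemap_lock_folio()/folio_unlock()/folio_put(); at this point in the series filemap_lock_folio() still returns NULL on a miss, so the existing NULL checks carry over unchanged.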
Link: https://lkml.kernel.org/r/20230125170537.96973-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: gerald.schaefer@linux.ibm.com Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de1f73e5e200..3a01a9dbf445 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5472,7 +5472,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct page *pagecache_page, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; @@ -5529,7 +5529,7 @@ retry_avoidcopy: * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_page != pagecache_page) + page_folio(old_page) != pagecache_folio) outside_reserve = 1; get_page(old_page); @@ -5922,7 +5922,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); @@ -5985,7 +5985,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; - struct page *pagecache_page = NULL; + struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; @@ -6067,7 +6067,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = find_lock_page(mapping, idx); + pagecache_folio = filemap_lock_folio(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6087,9 +6087,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, }; spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -6098,11 +6098,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and - * pagecache_page, so here we need take the former one - * when page != pagecache_page or !pagecache_page. + * pagecache_folio, so here we need take the former one + * when page != pagecache_folio or !pagecache_folio. 
*/ page = pte_page(entry); - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) if (!trylock_page(page)) { need_wait_lock = 1; goto out_ptl; @@ -6113,7 +6113,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_page, ptl); + pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); @@ -6124,15 +6124,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) unlock_page(page); put_page(page); out_ptl: spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); -- cgit v1.2.3-70-g09d2 From 9747b9e92418b61c2281561e0651803f1fad0159 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:36 +0800 Subject: mm: hugetlb: change to return bool for isolate_hugetlb() Now the isolate_hugetlb() only returns 0 or -EBUSY, and most users did not care about the negative value, thus we can convert the isolate_hugetlb() to return a boolean value to make code more clear when checking the hugetlb isolation state. Moreover converts 2 users which will consider the negative value returned by isolate_hugetlb(). No functional changes intended. [akpm@linux-foundation.org: shorten locked section, per SeongJae Park] Link: https://lkml.kernel.org/r/12a287c5bebc13df304387087bbecc6421510849.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 13 ++++++++----- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 +++---- 5 files changed, 16 insertions(+), 14 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index df6dd624ccfe..5f5e4177b2e0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct folio *folio, struct list_head *list); +bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,9 +413,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - return -EBUSY; + return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a01a9dbf445..07abcb6eb203 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,12 +2925,15 @@ retry: */ goto free_new; } else if (folio_ref_count(old_folio)) { + bool 
isolated; + /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_folio, list); + isolated = isolate_hugetlb(old_folio, list); + ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3005,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7251,15 +7254,15 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct folio *folio, struct list_head *list) +bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - int ret = 0; + bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { - ret = -EBUSY; + ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e504362fdb23..8604753bc644 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2751bc3310fd..a256a241fd1d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -609,7 +609,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(folio, qp->pagelist) && + if (!isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate folio but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 53010a142e7f..2db546a0618c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2095,6 +2095,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; struct page *page; int err; + bool isolated; mmap_read_lock(mm); err = -EFAULT; @@ -2126,13 +2127,11 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page_folio(page), pagelist); - if (!err) - err = 1; + isolated = isolate_hugetlb(page_folio(page), pagelist); + err = isolated ? 1 : -EBUSY; } } else { struct page *head; - bool isolated; head = compound_head(page); isolated = isolate_lru_page(head); -- cgit v1.2.3-70-g09d2
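As a usage note, a minimal sketch of how a caller that still needs an errno reads after this change; the wrapper name is hypothetical and not part of the patch:

#include <linux/hugetlb.h>
#include <linux/list.h>

/* Hypothetical wrapper: translate the new boolean result into an errno. */
static int hugetlb_isolate_for_migration(struct folio *folio, struct list_head *list)
{
	/*
	 * isolate_hugetlb() now answers "was the folio isolated onto list?"
	 * directly: true means it was a migratable hugetlb folio and a
	 * reference was taken. Callers that still report an errno, such as
	 * add_page_for_migration() and alloc_and_dissolve_hugetlb_folio(),
	 * map false back to -EBUSY as before.
	 */
	return isolate_hugetlb(folio, list) ? 0 : -EBUSY;
}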