summaryrefslogtreecommitdiff
path: root/mm/gup.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
commit98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/gup.c
parentdf202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parentf403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Almost all of MM here. A few things are still getting finished off, reviewed, etc. - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages. - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis. - Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature. - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation. - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization. - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv. - David Vernet has done some fixup work on the memcg selftests. - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files. - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available. - Nadav Amit has done some optimization of TLB flushing during mprotect(). - Neil Brown continues to labor away at improving our swap-over-NFS support. - David Hildenbrand has some fixes to anon page COWing versus get_user_pages(). - Peng Liu fixed some errors in the core hugetlb code. - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps. - Some cleanups of the arch-specific pagemap code from Anshuman Khandual. - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages. - Roman Gushchin has done more work on the memcg selftests. ... and, of course, many smaller fixes and cleanups. Notably, the customary million cleanup serieses from Miaohe Lin" * tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits) mm: kfence: use PAGE_ALIGNED helper selftests: vm: add the "settings" file with timeout variable selftests: vm: add "test_hmm.sh" to TEST_FILES selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests selftests: vm: add migration to the .gitignore selftests/vm/pkeys: fix typo in comment ksm: fix typo in comment selftests: vm: add process_mrelease tests Revert "mm/vmscan: never demote for memcg reclaim" mm/kfence: print disabling or re-enabling message include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" mm: fix a potential infinite loop in start_isolate_page_range() MAINTAINERS: add Muchun as co-maintainer for HugeTLB zram: fix Kconfig dependency warning mm/shmem: fix shmem folio swapoff hang cgroup: fix an error handling path in alloc_pagecache_max_30M() mm: damon: use HPAGE_PMD_SIZE tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate nodemask.h: fix compilation error with GCC12 ...
Diffstat (limited to 'mm/gup.c')
-rw-r--r--mm/gup.c127
1 files changed, 119 insertions, 8 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 501bc150792c..551264407624 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,39 @@ struct follow_page_context {
unsigned int page_mask;
};
+static inline void sanity_check_pinned_pages(struct page **pages,
+ unsigned long npages)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
+
+ /*
+ * We only pin anonymous pages if they are exclusive. Once pinned, we
+ * can no longer turn them possibly shared and PageAnonExclusive() will
+ * stick around until the page is freed.
+ *
+ * We'd like to verify that our pinned anonymous pages are still mapped
+ * exclusively. The issue with anon THP is that we don't know how
+ * they are/were mapped when pinning them. However, for anon
+ * THP we can assume that either the given page (PTE-mapped THP) or
+ * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
+ * neither is the case, there is certainly something wrong.
+ */
+ for (; npages; npages--, pages++) {
+ struct page *page = *pages;
+ struct folio *folio = page_folio(page);
+
+ if (!folio_test_anon(folio))
+ continue;
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ else
+ /* Either a PTE-mapped or a PMD-mapped THP. */
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
+ }
+}
+
/*
* Return the folio with ref appropriately incremented,
* or NULL if that failed.
@@ -204,6 +237,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
*/
void unpin_user_page(struct page *page)
{
+ sanity_check_pinned_pages(&page, 1);
gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
@@ -272,6 +306,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
return;
}
+ sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
folio = gup_folio_next(pages, npages, i, &nr);
/*
@@ -344,6 +379,23 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
+{
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
+
+ /*
+ * Don't perform any sanity checks because we might have raced with
+ * fork() and some anonymous pages might now actually be shared --
+ * which is why we're unpinning after all.
+ */
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
+}
+
/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
@@ -367,6 +419,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
+ sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
folio = gup_folio_next(pages, npages, i, &nr);
gup_put_folio(folio, nr, FOLL_PIN);
@@ -506,6 +559,14 @@ retry:
}
}
+ if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ page = ERR_PTR(-EMLINK);
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
@@ -732,6 +793,11 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
* the device's dev_pagemap metadata to avoid repeating expensive lookups.
*
+ * When getting an anonymous page and the caller has to trigger unsharing
+ * of a shared anonymous page first, -EMLINK is returned. The caller should
+ * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
+ * relevant with FOLL_PIN and !FOLL_WRITE.
+ *
* On output, the @ctx->page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
@@ -787,6 +853,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
if (vma_is_secretmem(vma))
return NULL;
+ if (foll_flags & FOLL_PIN)
+ return NULL;
+
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -852,7 +921,8 @@ unmap:
* is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *locked)
+ unsigned long address, unsigned int *flags, bool unshare,
+ int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -874,6 +944,11 @@ static int faultin_page(struct vm_area_struct *vma,
*/
fault_flags |= FAULT_FLAG_TRIED;
}
+ if (unshare) {
+ fault_flags |= FAULT_FLAG_UNSHARE;
+ /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
+ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ }
ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
@@ -1095,8 +1170,9 @@ retry:
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &ctx);
- if (!page) {
- ret = faultin_page(vma, start, &foll_flags, locked);
+ if (!page || PTR_ERR(page) == -EMLINK) {
+ ret = faultin_page(vma, start, &foll_flags,
+ PTR_ERR(page) == -EMLINK, locked);
switch (ret) {
case 0:
goto retry;
@@ -2227,6 +2303,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
+ if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
/*
* We need to make the page accessible if and only if we are
* going to access its content (the FOLL_PIN case). Please
@@ -2407,6 +2488,11 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
+ if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2468,6 +2554,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2503,6 +2594,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2740,8 +2836,10 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
*/
if (gup_flags & FOLL_PIN) {
if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
- unpin_user_pages(pages, nr_pinned);
+ unpin_user_pages_lockless(pages, nr_pinned);
return 0;
+ } else {
+ sanity_check_pinned_pages(pages, nr_pinned);
}
}
return nr_pinned;
@@ -2900,6 +2998,9 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -2922,6 +3023,9 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
*/
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return 0;
+
+ if (WARN_ON_ONCE(!pages))
+ return 0;
/*
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
@@ -2949,8 +3053,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
@@ -2973,6 +3076,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
@@ -2986,8 +3092,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
@@ -3005,6 +3110,9 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
@@ -3023,6 +3131,9 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}