diff options
Diffstat (limited to 'mm/gup.c')
-rw-r--r-- | mm/gup.c | 335 |
1 files changed, 113 insertions, 222 deletions
@@ -123,6 +123,9 @@ retry: */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + if (flags & FOLL_GET) return try_get_folio(page, refs); else if (flags & FOLL_PIN) { @@ -202,17 +205,22 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". * - * Return: true for success, or if no action was required (if neither FOLL_PIN - * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or - * FOLL_PIN was set, but the page could not be grabbed. + * Return: 0 for success, or if no action was required (if neither FOLL_PIN + * nor FOLL_GET was set, nothing is done). A negative error code for failure: + * + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * be grabbed. */ -bool __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) - return false; + return -ENOMEM; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio); @@ -232,7 +240,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); } - return true; + return 0; } /** @@ -537,42 +545,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - - /* - * Considering PTE level hugetlb, like continuous-PTE hugetlb on - * ARM64 architecture. - */ - if (is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } + if (!pte_present(pte)) + goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; @@ -615,7 +594,7 @@ retry: } } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } @@ -624,10 +603,12 @@ retry: !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - if (unlikely(!try_grab_page(page, flags))) { - page = ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (unlikely(ret)) { + page = ERR_PTR(ret); goto out; } + /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see @@ -680,42 +661,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pmd_val(pmdval)))) { - page = follow_huge_pd(vma, address, - __hugepd(pmd_val(pmdval)), flags, - PMD_SHIFT); - if (page) - return page; + if (!pmd_present(pmdval)) return no_page_table(vma, flags); - } -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. - */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -729,18 +676,10 @@ retry: if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); -retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); @@ -783,20 +722,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pud_val(*pud)))) { - page = follow_huge_pd(vma, address, - __hugepd(pud_val(*pud)), flags, - PUD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); @@ -816,7 +741,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { p4d_t *p4d; - struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -825,14 +749,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); - if (is_hugepd(__hugepd(p4d_val(*p4d)))) { - page = follow_huge_pd(vma, address, - __hugepd(p4d_val(*p4d)), flags, - P4D_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } return follow_pud_mask(vma, address, p4d, flags, ctx); } @@ -870,10 +786,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, ctx->page_mask = 0; - /* make this handle hugepd */ - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + /* + * Call hugetlb_follow_page_mask for hugetlb vmas as it will use + * special hugetlb page table walking code. This eliminates the + * need to check for hugetlb entries in the general walking code. + * + * hugetlb_follow_page_mask is only for follow_page() handling here. + * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. + */ + if (is_vm_hugetlb_page(vma)) { + page = hugetlb_follow_page_mask(vma, address, flags); + if (!page) + page = no_page_table(vma, flags); return page; } @@ -882,21 +806,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - if (pgd_huge(*pgd)) { - page = follow_huge_pgd(mm, address, pgd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pgd_val(*pgd)))) { - page = follow_huge_pd(vma, address, - __hugepd(pgd_val(*pgd)), flags, - PGDIR_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } - return follow_p4d_mask(vma, address, pgd, flags, ctx); } @@ -960,10 +869,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(*pte); } - if (unlikely(!try_grab_page(*page, gup_flags))) { - ret = -ENOMEM; + ret = try_grab_page(*page, gup_flags); + if (unlikely(ret)) goto unmap; - } out: ret = 0; unmap: @@ -989,8 +897,17 @@ static int faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + /* + * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set + * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. + * That's because some callers may not be prepared to + * handle early exits caused by non-fatal signals. + */ + if (*flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { @@ -1058,6 +975,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA)) + return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) return -EFAULT; @@ -1065,6 +985,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; + /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ + if (is_vm_hugetlb_page(vma)) + return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -1392,6 +1315,22 @@ retry: EXPORT_SYMBOL_GPL(fixup_user_fault); /* + * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is + * specified, it'll also respond to generic signals. The caller of GUP + * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. + */ +static bool gup_signal_pending(unsigned int flags) +{ + if (fatal_signal_pending(current)) + return true; + + if (!(flags & FOLL_INTERRUPTIBLE)) + return false; + + return signal_pending(current); +} + +/* * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ @@ -1472,11 +1411,11 @@ retry: * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted - * by fatal signals, so we need to check it before we + * by fatal signals of even common signals, depending on + * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ - - if (fatal_signal_pending(current)) { + if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; @@ -2105,14 +2044,19 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int gup_flags) { + bool must_unlock = false; unsigned int flags; long rc, nr_pinned_pages; + if (locked && WARN_ON_ONCE(!*locked)) + return -EINVAL; + if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); + locked, gup_flags); /* * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM @@ -2126,8 +2070,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, return -EINVAL; flags = memalloc_pin_save(); do { + if (locked && !*locked) { + mmap_read_lock(mm); + must_unlock = true; + *locked = 1; + } nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, + pages, vmas, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2137,6 +2086,10 @@ static long __gup_longterm_locked(struct mm_struct *mm, } while (rc == -EAGAIN); memalloc_pin_restore(flags); + if (locked && *locked && must_unlock) { + mmap_read_unlock(mm); + *locked = 0; + } return rc ? rc : nr_pinned_pages; } @@ -2160,35 +2113,6 @@ static bool is_valid_gup_flags(unsigned int gup_flags) } #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - /* - * Parts of FOLL_LONGTERM behavior are incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. However, this only comes up if locked is set, and there are - * callers that do request FOLL_LONGTERM, but do not set locked. So, - * allow what we can. - */ - if (gup_flags & FOLL_LONGTERM) { - if (WARN_ON_ONCE(locked)) - return -EINVAL; - /* - * This will check the vmas (even if our vmas arg is NULL) - * and return -ENOTSUPP if DAX isn't allowed in this case: - */ - return __gup_longterm_locked(mm, start, nr_pages, pages, - vmas, gup_flags | FOLL_TOUCH | - FOLL_REMOTE); - } - - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); -} - /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm @@ -2257,8 +2181,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2270,14 +2194,6 @@ long get_user_pages_remote(struct mm_struct *mm, { return 0; } - -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - return 0; -} #endif /* !CONFIG_MMU */ /** @@ -2304,7 +2220,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags | FOLL_TOUCH); + pages, vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2330,18 +2246,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - mmap_read_lock(mm); - ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); return ret; @@ -2468,7 +2375,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, goto pte_unmap; } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2534,9 +2441,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, undo_dev_pagemap(nr, nr_start, flags, pages); break; } + + if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + undo_dev_pagemap(nr, nr_start, flags, pages); + break; + } + SetPageReferenced(page); pages[*nr] = page; - if (unlikely(!try_grab_page(page, flags))) { + if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } @@ -2654,7 +2567,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2720,7 +2633,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2760,7 +2673,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2852,7 +2765,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud))) { + if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; @@ -2935,29 +2848,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static int __gup_longterm_unlocked(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int ret; - - /* - * FIXME: FOLL_LONGTERM does not work with - * get_user_pages_unlocked() (see comments in that function) - */ - if (gup_flags & FOLL_LONGTERM) { - mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current->mm, - start, nr_pages, - pages, NULL, gup_flags); - mmap_read_unlock(current->mm); - } else { - ret = get_user_pages_unlocked(start, nr_pages, - pages, gup_flags); - } - - return ret; -} - static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, @@ -3018,7 +2908,8 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | - FOLL_FAST_ONLY | FOLL_NOFAULT))) + FOLL_FAST_ONLY | FOLL_NOFAULT | + FOLL_PCI_P2PDMA))) return -EINVAL; if (gup_flags & FOLL_PIN) @@ -3041,8 +2932,8 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, - pages); + ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, + gup_flags); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3241,9 +3132,9 @@ long pin_user_pages_remote(struct mm_struct *mm, if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_PIN | FOLL_TOUCH | + FOLL_REMOTE); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3277,7 +3168,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags); + pages, vmas, NULL, gup_flags); } EXPORT_SYMBOL(pin_user_pages); |