diff options
Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r-- | mm/userfaultfd.c | 170 |
1 files changed, 169 insertions, 1 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index acc56c75ba99..ce13c4062647 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; @@ -1763,3 +1763,171 @@ out: VM_WARN_ON(!moved && !err); return moved ? moved : err; } + +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vm_flags_reset(vma, flags); + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + +static void userfaultfd_set_ctx(struct vm_area_struct *vma, + struct userfaultfd_ctx *ctx, + unsigned long flags) +{ + vma_start_write(vma); + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; + userfaultfd_set_vm_flags(vma, + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); +} + +void userfaultfd_reset_ctx(struct vm_area_struct *vma) +{ + userfaultfd_set_ctx(vma, NULL, 0); +} + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct vm_area_struct *ret; + + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, start, end - start, false); + + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, + vma->vm_flags & ~__VM_UFFD_FLAGS, + NULL_VM_UFFD_CTX); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + if (!IS_ERR(ret)) + userfaultfd_reset_ctx(ret); + + return ret; +} + +/* Assumes mmap write lock taken, and mm_struct pinned. */ +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async) +{ + VMA_ITERATOR(vmi, ctx->mm, start); + struct vm_area_struct *prev = vma_prev(&vmi); + unsigned long vma_end; + unsigned long new_flags; + + if (vma->vm_start < start) + prev = vma; + + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + userfaultfd_set_ctx(vma, ctx, vm_flags); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + +skip: + prev = vma; + start = vma->vm_end; + } + + return 0; +} + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + /* the various vma->vm_userfaultfd_ctx still points to it */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (vma->vm_userfaultfd_ctx.ctx == ctx) + userfaultfd_reset_ctx(vma); + } + mmap_write_unlock(mm); +} + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx) +{ + struct vm_area_struct *vma, *prev; + VMA_ITERATOR(vmi, mm, 0); + + if (!mmget_not_zero(mm)) + return; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_lock. So + * it's critical that released is set to true (above), before + * taking the mmap_lock for writing. + */ + mmap_write_lock(mm); + prev = NULL; + for_each_vma(vmi, vma) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end); + prev = vma; + } + mmap_write_unlock(mm); + mmput(mm); +} |