diff options
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r-- | mm/khugepaged.c | 437 |
1 file changed, 298 insertions, 139 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0ec69b96b497..6b9d39d65b73 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -55,6 +55,9 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, + SCAN_STORE_FAILED, + SCAN_COPY_MC, + SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS @@ -685,20 +688,21 @@ out: return result; } -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) +static void __collapse_huge_page_copy_succeeded(pte_t *pte, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist) { - struct page *src_page, *tmp; + struct page *src_page; + struct page *tmp; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval; + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { /* @@ -710,7 +714,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } else { src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); if (!PageCompound(src_page)) release_pte_page(src_page); /* @@ -737,6 +740,87 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } +static void __collapse_huge_page_copy_failed(pte_t *pte, + pmd_t *pmd, + pmd_t orig_pmd, + struct vm_area_struct *vma, + struct list_head *compound_pagelist) +{ + spinlock_t *pmd_ptl; + + /* + * Re-establish the PMD to point to the original page table + * entry. Restoring PMD needs to be done prior to releasing + * pages. Since pages are still isolated and locked here, + * acquiring anon_vma_lock_write is unnecessary. 
+ */ + pmd_ptl = pmd_lock(vma->vm_mm, pmd); + pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); + spin_unlock(pmd_ptl); + /* + * Release both raw and compound pages isolated + * in __collapse_huge_page_isolate. + */ + release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); +} + +/* + * __collapse_huge_page_copy - attempts to copy memory contents from raw + * pages to a hugepage. Cleans up the raw pages if copying succeeds; + * otherwise restores the original page table and releases isolated raw pages. + * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. + * + * @pte: starting of the PTEs to copy from + * @page: the new hugepage to copy contents to + * @pmd: pointer to the new hugepage's PMD + * @orig_pmd: the original raw pages' PMD + * @vma: the original raw pages' virtual memory area + * @address: starting address to copy + * @ptl: lock on raw pages' PTEs + * @compound_pagelist: list that stores compound pages + */ +static int __collapse_huge_page_copy(pte_t *pte, + struct page *page, + pmd_t *pmd, + pmd_t orig_pmd, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist) +{ + struct page *src_page; + pte_t *_pte; + pte_t pteval; + unsigned long _address; + int result = SCAN_SUCCEED; + + /* + * Copying pages' contents is subject to memory poison at any iteration. 
+ */ + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, _address += PAGE_SIZE) { + pteval = *_pte; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, _address); + continue; + } + src_page = pte_page(pteval); + if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { + result = SCAN_COPY_MC; + break; + } + } + + if (likely(result == SCAN_SUCCEED)) + __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, + compound_pagelist); + else + __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, + compound_pagelist); + + return result; +} + static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); @@ -976,12 +1060,19 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); + struct folio *folio; if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) return SCAN_ALLOC_HUGE_PAGE_FAIL; - if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + + folio = page_folio(*hpage); + if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { + folio_put(folio); + *hpage = NULL; return SCAN_CGROUP_CHARGE_FAIL; + } count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; } @@ -1053,6 +1144,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, if (result != SCAN_SUCCEED) goto out_up_write; + vma_start_write(vma); anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, @@ -1102,9 +1194,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, - &compound_pagelist); + result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, + vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); + if (unlikely(result != SCAN_SUCCEED)) + 
goto out_up_write; + /* * spin_lock() below is not the equivalent of smp_wmb(), but * the smp_wmb() inside __SetPageUptodate() can be reused to @@ -1132,10 +1228,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, out_up_write: mmap_write_unlock(mm); out_nolock: - if (hpage) { - mem_cgroup_uncharge(page_folio(hpage)); + if (hpage) put_page(hpage); - } trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } @@ -1176,7 +1270,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ - if (pte_swp_uffd_wp(pteval)) { + if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } @@ -1516,6 +1610,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } + /* Lock the vma before taking i_mmap and page table locks */ + vma_start_write(vma); + /* * We need to lock the mapping so that from here on, only GUP-fast and * hardware page walks can access the parts of the page tables that @@ -1693,6 +1790,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, result = SCAN_PTE_MAPPED_HUGEPAGE; if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { + /* trylock for the same lock inversion as above */ + if (!vma_try_start_write(vma)) + goto unlock_next; + /* * Re-check whether we have an ->anon_vma, because * collapse_and_free_pmd() requires that either no @@ -1758,17 +1859,18 @@ next: * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; - * - scan page cache replacing old pages with the new one + * - scan page cache, locking old pages * + swap/gup in pages if necessary; - * + fill in gaps; - * + keep old pages around in case rollback is required; + * - copy data to new page + * - handle shmem holes + * + re-validate that holes weren't filled by someone else + * + check for userfaultfd + * - finalize updates to the page 
cache; * - if replacing succeeds: - * + copy data over; - * + free old pages; * + unlock huge page; + * + free old pages; * - if replacing failed; - * + put all pages back and unfreeze them; - * + restore gaps in the page cache; + * + unlock old pages * + unlock and free huge page; */ static int collapse_file(struct mm_struct *mm, unsigned long addr, @@ -1777,6 +1879,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; + struct page *page; + struct page *tmp; + struct folio *folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1791,6 +1896,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + __SetPageLocked(hpage); + if (is_shmem) + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; + /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present @@ -1803,26 +1914,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; - goto out; + goto rollback; } } while (1); - __SetPageLocked(hpage); - if (is_shmem) - __SetPageSwapBacked(hpage); - hpage->index = start; - hpage->mapping = mapping; - - /* - * At this point the hpage is locked and not up-to-date. - * It's safe to insert it into the page cache, because nobody would - * be able to map it or use it in another way until we unlock it. 
- */ - xas_set(&xas, start); for (index = start; index < end; index++) { - struct page *page = xas_next(&xas); - struct folio *folio; + page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1837,13 +1935,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, result = SCAN_TRUNCATED; goto xa_locked; } - xas_set(&xas, index); + xas_set(&xas, index + 1); } if (!shmem_charge(mapping->host, 1)) { result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, hpage); nr_none++; continue; } @@ -1856,6 +1953,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, result = SCAN_FAIL; goto xa_unlocked; } + /* drain pagevecs to help isolate_lru_page() */ + lru_add_drain(); page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); @@ -1976,12 +2075,16 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, VM_BUG_ON_PAGE(page != xas_load(&xas), page); /* - * The page is expected to have page_count() == 3: + * We control three references to the page: * - we hold a pin on it; * - one reference from page cache; * - one from isolate_lru_page; + * If those are the only references, then any new usage of the + * page will have to fetch it from the page cache. That requires + * locking the page to handle truncate, so any new usage will be + * blocked until we unlock page after collapse/during rollback. */ - if (!page_ref_freeze(page, 3)) { + if (page_count(page) != 3) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); putback_lru_page(page); @@ -1989,25 +2092,17 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } /* - * Add the page to the list to be able to undo the collapse if - * something go wrong. + * Accumulate the pages that are being collapsed. */ list_add_tail(&page->lru, &pagelist); - - /* Finally, replace with the new page. 
*/ - xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(hpage); - if (is_shmem) - __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); - else { - __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -2018,21 +2113,10 @@ out_unlock: smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); - goto xa_locked; } } - if (nr_none) { - __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); - /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); - } - - /* Join all the small entries into a single multi-index entry */ - xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -2044,101 +2128,174 @@ xa_unlocked: */ try_to_unmap_flush(); - if (result == SCAN_SUCCEED) { - struct page *page, *tmp; - struct folio *folio; + if (result != SCAN_SUCCEED) + goto rollback; - /* - * Replacing old pages with new one has succeeded, now we - * need to copy the content and free the old pages. - */ - index = start; - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - while (index < page->index) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); - index++; - } - copy_highpage(hpage + (page->index % HPAGE_PMD_NR), - page); - list_del(&page->lru); - page->mapping = NULL; - page_ref_unfreeze(page, 1); - ClearPageActive(page); - ClearPageUnevictable(page); - unlock_page(page); - put_page(page); - index++; - } - while (index < end) { + /* + * The old pages are locked, so they won't change anymore. 
+ */ + index = start; + list_for_each_entry(page, &pagelist, lru) { + while (index < page->index) { clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } + if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) { + result = SCAN_COPY_MC; + goto rollback; + } + index++; + } + while (index < end) { + clear_highpage(hpage + (index % HPAGE_PMD_NR)); + index++; + } - folio = page_folio(hpage); - folio_mark_uptodate(folio); - folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (nr_none) { + struct vm_area_struct *vma; + int nr_none_check = 0; - if (is_shmem) - folio_mark_dirty(folio); - folio_add_lru(folio); + i_mmap_lock_read(mapping); + xas_lock_irq(&xas); - /* - * Remove pte page tables, so we can re-fault the page as huge. - */ - result = retract_page_tables(mapping, start, mm, addr, hpage, - cc); - unlock_page(hpage); - hpage = NULL; - } else { - struct page *page; + xas_set(&xas, start); + for (index = start; index < end; index++) { + if (!xas_next(&xas)) { + xas_store(&xas, XA_RETRY_ENTRY); + if (xas_error(&xas)) { + result = SCAN_STORE_FAILED; + goto immap_locked; + } + nr_none_check++; + } + } - /* Something went wrong: roll back page cache changes */ - xas_lock_irq(&xas); - if (nr_none) { - mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); + if (nr_none != nr_none_check) { + result = SCAN_PAGE_FILLED; + goto immap_locked; } - xas_set(&xas, start); - xas_for_each(&xas, page, end - 1) { - page = list_first_entry_or_null(&pagelist, - struct page, lru); - if (!page || xas.xa_index < page->index) { - if (!nr_none) - break; - nr_none--; - /* Put holes back where they were */ - xas_store(&xas, NULL); - continue; + /* + * If userspace observed a missing page in a VMA with a MODE_MISSING + * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that + * page. If so, we need to roll back to avoid suppressing such an + * event. 
Since wp/minor userfaultfds don't give userspace any + * guarantees that the kernel doesn't fill a missing page with a zero + * page, so they don't matter here. + * + * Any userfaultfds registered after this point will not be able to + * observe any missing pages due to the previously inserted retry + * entries. + */ + vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { + if (userfaultfd_missing(vma)) { + result = SCAN_EXCEED_NONE_PTE; + goto immap_locked; } + } - VM_BUG_ON_PAGE(page->index != xas.xa_index, page); +immap_locked: + i_mmap_unlock_read(mapping); + if (result != SCAN_SUCCEED) { + xas_set(&xas, start); + for (index = start; index < end; index++) { + if (xas_next(&xas) == XA_RETRY_ENTRY) + xas_store(&xas, NULL); + } - /* Unfreeze the page. */ - list_del(&page->lru); - page_ref_unfreeze(page, 2); - xas_store(&xas, page); - xas_pause(&xas); xas_unlock_irq(&xas); - unlock_page(page); - putback_lru_page(page); - xas_lock_irq(&xas); + goto rollback; } - VM_BUG_ON(nr_none); + } else { + xas_lock_irq(&xas); + } + + nr = thp_nr_pages(hpage); + if (is_shmem) + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); + else + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + + if (nr_none) { + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); + } + + /* + * Mark hpage as uptodate before inserting it into the page cache so + * that it isn't mistaken for an fallocated but unwritten page. + */ + folio = page_folio(hpage); + folio_mark_uptodate(folio); + folio_ref_add(folio, HPAGE_PMD_NR - 1); + + if (is_shmem) + folio_mark_dirty(folio); + folio_add_lru(folio); + + /* Join all the small entries into a single multi-index entry. */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, hpage); + WARN_ON_ONCE(xas_error(&xas)); + xas_unlock_irq(&xas); + + /* + * Remove pte page tables, so we can re-fault the page as huge. 
+ */ + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); + unlock_page(hpage); + + /* + * The collapse has succeeded, so free the old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + list_del(&page->lru); + page->mapping = NULL; + ClearPageActive(page); + ClearPageUnevictable(page); + unlock_page(page); + folio_put_refs(page_folio(page), 3); + } + + goto out; + +rollback: + /* Something went wrong: roll back page cache changes */ + if (nr_none) { + xas_lock_irq(&xas); + mapping->nrpages -= nr_none; + shmem_uncharge(mapping->host, nr_none); xas_unlock_irq(&xas); + } - hpage->mapping = NULL; + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + list_del(&page->lru); + unlock_page(page); + putback_lru_page(page); + put_page(page); + } + /* + * Undo the updates of filemap_nr_thps_inc for non-SHMEM + * file only. This undo is not needed unless failure is + * due to SCAN_COPY_MC. + */ + if (!is_shmem && result == SCAN_COPY_MC) { + filemap_nr_thps_dec(mapping); + /* + * Paired with smp_mb() in do_dentry_open() to + * ensure the update to nr_thps is visible. + */ + smp_mb(); } - if (hpage) - unlock_page(hpage); + hpage->mapping = NULL; + + unlock_page(hpage); + put_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (hpage) { - mem_cgroup_uncharge(page_folio(hpage)); - put_page(hpage); - } - trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } @@ -2624,12 +2781,14 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: + case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: + case SCAN_PAGE_FILLED: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to |