summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c300
1 files changed, 93 insertions, 207 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 406a3c28c026..2fe38212e07c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -34,11 +34,15 @@
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
+#include <linux/sched/sysctl.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
/*
* By default, transparent hugepage support is disabled in order to avoid
* risking an increased memory footprint for applications that are not
@@ -529,7 +533,7 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
return false;
@@ -538,7 +542,6 @@ bool is_transparent_hugepage(struct page *page)
return is_huge_zero_page(page) ||
page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
-EXPORT_SYMBOL_GPL(is_transparent_hugepage);
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
@@ -582,13 +585,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
- goto out;
-
ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
if (ret)
return ret;
-out:
+
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -1303,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1319,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
/*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
+ * See do_wp_page(): we can only map the page writable if there are
+ * no additional references. Note that we always drain the LRU
+ * pagevecs immediately after adding a THP.
*/
- if (reuse_swap_page(page)) {
+ if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ goto unlock_fallback;
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1333,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
+unlock_fallback:
unlock_page(page);
spin_unlock(vmf->ptl);
fallback:
@@ -1380,39 +1385,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * We don't mlock() pte-mapped THPs. This way we can avoid
- * leaking mlocked pages into non-VM_LOCKED VMAs.
- *
- * For anon THP:
- *
- * In most cases the pmd is the only mapping of the page as we
- * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
- * writable private mappings in populate_vma_page_range().
- *
- * The only scenario when we have the page shared here is if we
- * mlocking read-only mapping shared over fork(). We skip
- * mlocking such pages.
- *
- * For file THP:
- *
- * We can expect PageDoubleMap() to be stable under page lock:
- * for file pages we set it in page_add_file_rmap(), which
- * requires page to be locked.
- */
-
- if (PageAnon(page) && compound_mapcount(page) != 1)
- goto skip_mlock;
- if (PageDoubleMap(page) || !page->mapping)
- goto skip_mlock;
- if (!trylock_page(page))
- goto skip_mlock;
- if (page->mapping && !PageDoubleMap(page))
- mlock_vma_page(page);
- unlock_page(page);
- }
-skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1610,7 +1582,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1766,17 +1738,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
#endif
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd))
- goto unlock;
+ if (prot_numa) {
+ struct page *page;
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (is_huge_zero_pmd(*pmd))
+ goto unlock;
- if (prot_numa && pmd_protnone(*pmd))
- goto unlock;
+ if (pmd_protnone(*pmd))
+ goto unlock;
+ page = pmd_page(*pmd);
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(page_to_nid(page)))
+ goto unlock;
+ }
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
@@ -1995,7 +1978,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(old_pmd))
SetPageReferenced(page);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
}
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
@@ -2055,9 +2038,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
}
- VM_BUG_ON_PAGE(!page_count(page), page);
- page_ref_add(page, HPAGE_PMD_NR - 1);
/*
* Withdraw the table only after we mark the pmd entry invalid.
@@ -2129,6 +2112,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
}
unlock_page_memcg(page);
+
+ /* Above is effectively page_remove_rmap(page, vma, true) */
+ munlock_vma_page(page, vma, true);
}
smp_wmb(); /* make pte visible before pmd */
@@ -2136,19 +2122,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (freeze) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, false);
+ page_remove_rmap(page + i, vma, false);
put_page(page + i);
}
}
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct page *page)
+ unsigned long address, bool freeze, struct folio *folio)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool do_unlock_page = false;
- pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2157,54 +2141,22 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
ptl = pmd_lock(vma->vm_mm, pmd);
/*
- * If caller asks to setup a migration entries, we need a page to check
- * pmd against. Otherwise we can end up replacing wrong page.
+ * If caller asks to setup a migration entry, we need a folio to check
+ * pmd against. Otherwise we can end up replacing wrong folio.
*/
- VM_BUG_ON(freeze && !page);
- if (page) {
- VM_WARN_ON_ONCE(!PageLocked(page));
- if (page != pmd_page(*pmd))
+ VM_BUG_ON(freeze && !folio);
+ if (folio) {
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ if (folio != page_folio(pmd_page(*pmd)))
goto out;
}
-repeat:
- if (pmd_trans_huge(*pmd)) {
- if (!page) {
- page = pmd_page(*pmd);
- /*
- * An anonymous page must be locked, to ensure that a
- * concurrent reuse_swap_page() sees stable mapcount;
- * but reuse_swap_page() is not used on shmem or file,
- * and page lock must not be taken when zap_pmd_range()
- * calls __split_huge_pmd() while i_mmap_lock is held.
- */
- if (PageAnon(page)) {
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
- put_page(page);
- page = NULL;
- goto repeat;
- }
- put_page(page);
- }
- do_unlock_page = true;
- }
- }
- if (PageMlocked(page))
- clear_page_mlock(page);
- } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd))
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+
out:
spin_unlock(ptl);
- if (do_unlock_page)
- unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2222,7 +2174,7 @@ out:
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct page *page)
+ bool freeze, struct folio *folio)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -2243,7 +2195,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
- __split_huge_pmd(vma, pmd, address, freeze, page);
+ __split_huge_pmd(vma, pmd, address, freeze, folio);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -2283,6 +2235,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
+ struct folio *folio = page_folio(page);
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
@@ -2293,26 +2246,27 @@ static void unmap_page(struct page *page)
* pages can simply be left unmapped, then faulted back on demand.
* If that is ever changed (perhaps for mlock), update remap_page().
*/
- if (PageAnon(page))
- try_to_migrate(page, ttu_flags);
+ if (folio_test_anon(folio))
+ try_to_migrate(folio, ttu_flags);
else
- try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
+ try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
-static void remap_page(struct page *page, unsigned int nr)
+static void remap_page(struct folio *folio, unsigned long nr)
{
- int i;
+ int i = 0;
/* If unmap_page() uses try_to_migrate() on file, remove this check */
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
return;
- if (PageTransHuge(page)) {
- remove_migration_ptes(page, page, true);
- } else {
- for (i = 0; i < nr; i++)
- remove_migration_ptes(page + i, page + i, true);
+ for (;;) {
+ remove_migration_ptes(folio, folio, true);
+ i += folio_nr_pages(folio);
+ if (i >= nr)
+ break;
+ folio = folio_next(folio);
}
}
@@ -2332,8 +2286,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!PageLRU(head));
+ if (PageUnevictable(tail))
+ tail->mlock_count = 0;
+ else
+ list_add_tail(&tail->lru, &head->lru);
SetPageLRU(tail);
- list_add_tail(&tail->lru, &head->lru);
}
}
@@ -2469,7 +2426,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
- remap_page(head, nr);
+ remap_page(folio, nr);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2494,91 +2451,20 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
}
-int total_mapcount(struct page *page)
-{
- int i, compound, nr, ret;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- compound = compound_mapcount(page);
- nr = compound_nr(page);
- if (PageHuge(page))
- return compound;
- ret = compound;
- for (i = 0; i < nr; i++)
- ret += atomic_read(&page[i]._mapcount) + 1;
- /* File pages has compound_mapcount included in _mapcount */
- if (!PageAnon(page))
- return ret - compound * nr;
- if (PageDoubleMap(page))
- ret -= nr;
- return ret;
-}
-
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page)
-{
- int i, ret;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (likely(!PageTransCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- page = compound_head(page);
-
- ret = 0;
- for (i = 0; i < thp_nr_pages(page); i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
- }
-
- if (PageDoubleMap(page))
- ret -= 1;
-
- return ret + compound_mapcount(page);
-}
-
/* Racy check whether the huge page can be split */
-bool can_split_huge_page(struct page *page, int *pextra_pins)
+bool can_split_folio(struct folio *folio, int *pextra_pins)
{
int extra_pins;
/* Additional pins from page cache */
- if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
else
- extra_pins = thp_nr_pages(page);
+ extra_pins = folio_nr_pages(folio);
if (pextra_pins)
*pextra_pins = extra_pins;
- return total_mapcount(page) == page_count(page) - extra_pins - 1;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}
/*
@@ -2602,7 +2488,8 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
*/
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct deferred_split *ds_queue = get_deferred_split_queue(head);
XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
@@ -2622,7 +2509,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
- * is similar to page_lock_anon_vma_read except the write lock
+ * is similar to folio_lock_anon_vma_read except the write lock
* is taken to serialise against parallel split or collapse
* operations.
*/
@@ -2669,7 +2556,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* Racy check if we can split the page, before unmap_page() will
* split PMDs
*/
- if (!can_split_huge_page(head, &extra_pins)) {
+ if (!can_split_folio(folio, &extra_pins)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2719,7 +2606,7 @@ fail:
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(head, thp_nr_pages(head));
+ remap_page(folio, folio_nr_pages(folio));
ret = -EBUSY;
}
@@ -2953,7 +2840,6 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
*/
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
struct vm_area_struct *vma = find_vma(mm, addr);
- unsigned int follflags;
struct page *page;
if (!vma || addr < vma->vm_start)
@@ -2966,8 +2852,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
}
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- page = follow_page(vma, addr, follflags);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
if (IS_ERR(page))
continue;
@@ -2978,7 +2863,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto next;
total++;
- if (!can_split_huge_page(compound_head(page), NULL))
+ if (!can_split_folio(page_folio(page), NULL))
goto next;
if (!trylock_page(page))
@@ -3171,8 +3056,9 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
+ trace_set_migration_pmd(address, pmd_val(pmdswp));
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3197,14 +3083,14 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
page_add_anon_rmap(new, vma, mmun_start, true);
else
- page_add_file_rmap(new, true);
+ page_add_file_rmap(new, vma, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
- mlock_vma_page(new);
+
+ /* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif