diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 442 |
1 files changed, 144 insertions, 298 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a3a6e2dee97..def84d8bcf2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -56,16 +56,6 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, - 1 << order); -} -#else -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return false; -} #endif static unsigned long hugetlb_cma_size __initdata; @@ -82,14 +72,14 @@ static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ -DEFINE_SPINLOCK(hugetlb_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ -static int num_fault_mutexes; -struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static int num_fault_mutexes __ro_after_init; +struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -100,6 +90,17 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +static void hugetlb_free_folio(struct folio *folio) +{ +#ifdef CONFIG_CMA + int nid = folio_nid(folio); + + if (cma_free_folio(hugetlb_cma[nid], folio)) + return; +#endif + folio_put(folio); +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1512,95 +1513,54 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -/* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i; - int nr_pages = 1 << order; - struct page *p; - - atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_large_mapcount, 0); - atomic_set(&folio->_pincount, 0); - - for (i = 1; i < nr_pages; i++) { - p = folio_page(folio, i); - p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; - p->mapping = NULL; - clear_compound_head(p); - if (!demote) - set_page_refcounted(p); - } - - __folio_clear_head(folio); -} - -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, true); -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, false); -} - -static void free_gigantic_folio(struct folio *folio, unsigned int order) -{ - /* - * If the page isn't allocated using the cma allocator, - * cma_release() returns false. - */ -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) - return; -#endif - - free_contig_range(folio_pfn(folio), 1 << order); -} - #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - struct page *page; - unsigned long nr_pages = pages_per_huge_page(h); + struct folio *folio; + int order = huge_page_order(h); + bool retried = false; + if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - +retry: + folio = NULL; #ifdef CONFIG_CMA { int node; - if (hugetlb_cma[nid]) { - page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); - } + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - if (!(gfp_mask & __GFP_THISNODE)) { + if (!folio && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; } } } #endif + if (!folio) { + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); + if (!folio) + return NULL; + } - page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); - return page ? page_folio(page) : NULL; + if (folio_ref_freeze(folio, 1)) + return folio; + + pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); + hugetlb_free_folio(folio); + if (!retried) { + retried = true; + goto retry; + } + return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ @@ -1617,10 +1577,6 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, { return NULL; } -static inline void free_gigantic_folio(struct folio *folio, - unsigned int order) { } -static inline void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) { } #endif /* @@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, folio_ref_unfreeze(folio, 1); - /* - * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_folio. - */ - if (hstate_is_gigantic(h) || - hugetlb_cma_folio(folio, huge_page_order(h))) { - destroy_compound_gigantic_folio(folio, huge_page_order(h)); - free_gigantic_folio(folio, huge_page_order(h)); - } else { - INIT_LIST_HEAD(&folio->_deferred_list); - folio_put(folio); - } + INIT_LIST_HEAD(&folio->_deferred_list); + hugetlb_free_folio(folio); } /* @@ -2032,95 +1978,6 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i, j; - int nr_pages = 1 << order; - struct page *p; - - __folio_clear_reserved(folio); - for (i = 0; i < nr_pages; i++) { - p = folio_page(folio, i); - - /* - * For gigantic hugepages allocated through bootmem at - * boot, it's safer to be consistent with the not-gigantic - * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwise drivers using get_user_pages() to access tail - * pages may get the reference counting wrong if they see - * PG_reserved set on a tail page (despite the head page not - * having PG_reserved set). Enforcing this consistency between - * head and tail pages allows drivers to optimize away a check - * on the head page when they need know if put_page() is needed - * after get_user_pages(). - */ - if (i != 0) /* head page cleared above */ - __ClearPageReserved(p); - /* - * Subtle and very unlikely - * - * Gigantic 'page allocators' such as memblock or cma will - * return a set of pages with each page ref counted. We need - * to turn this set of pages into a compound page with tail - * page ref counts set to zero. Code such as speculative page - * cache adding could take a ref on a 'to be' tail page. - * We need to respect any increased ref count, and only set - * the ref count to zero if count is currently 1. If count - * is not 1, we return an error. An error return indicates - * the set of pages can not be converted to a gigantic page. - * The caller who allocated the pages should then discard the - * pages using the appropriate free interface. - * - * In the case of demote, the ref count will be zero. - */ - if (!demote) { - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; - } - } else { - VM_BUG_ON_PAGE(page_count(p), p); - } - if (i != 0) - set_compound_head(p, &folio->page); - } - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ - folio_set_order(folio, order); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_large_mapcount, -1); - atomic_set(&folio->_pincount, 0); - return true; - -out_error: - /* undo page modifications made above */ - for (j = 0; j < i; j++) { - p = folio_page(folio, j); - if (j != 0) - clear_compound_head(p); - set_page_refcounted(p); - } - /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++) { - p = folio_page(folio, j); - __ClearPageReserved(p); - } - return false; -} - -static bool prep_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, false); -} - -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, true); -} - /* * Find and lock address space (mapping) in write mode. * @@ -2159,7 +2016,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; - gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) @@ -2206,48 +2062,16 @@ retry: return folio; } -static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) -{ - struct folio *folio; - bool retry = false; - -retry: - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, - nid, nmask, node_alloc_noretry); - if (!folio) - return NULL; - - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { - /* - * Rare failure to convert pages to compound page. - * Free pages and try again - ONCE! - */ - free_gigantic_folio(folio, huge_page_order(h)); - if (!retry) { - retry = true; - goto retry; - } - return NULL; - } - } - - return folio; -} - static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2265,7 +2089,10 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; @@ -2549,9 +2376,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ @@ -3333,6 +3159,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); + __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3921,101 +3748,120 @@ out: return 0; } -static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) +static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, + struct list_head *src_list) { - int i, nid = folio_nid(folio); - struct hstate *target_hstate; - struct page *subpage; - struct folio *inner_folio; - int rc = 0; - - target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - - remove_hugetlb_folio(h, folio, false); - spin_unlock_irq(&hugetlb_lock); + long rc; + struct folio *folio, *next; + LIST_HEAD(dst_list); + LIST_HEAD(ret_list); - /* - * If vmemmap already existed for folio, the remove routine above would - * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be - * passed hugetlb folios and will BUG otherwise. - */ - if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore_folio(h, folio); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - return rc; - } - } - - /* - * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count folios. - */ - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); + list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * - * Note that we already hold h->resize_lock. To prevent deadlock, + * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ - mutex_lock(&target_hstate->resize_lock); - for (i = 0; i < pages_per_huge_page(h); - i += pages_per_huge_page(target_hstate)) { - subpage = folio_page(folio, i); - inner_folio = page_folio(subpage); - if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(inner_folio, - target_hstate->order); - else - prep_compound_page(subpage, target_hstate->order); - folio_change_private(inner_folio, NULL); - prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_folio(inner_folio); + mutex_lock(&dst->resize_lock); + + list_for_each_entry_safe(folio, next, src_list, lru) { + int i; + + if (folio_test_hugetlb_vmemmap_optimized(folio)) + continue; + + list_del(&folio->lru); + + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); + pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); + + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { + struct page *page = folio_page(folio, i); + + page->mapping = NULL; + clear_compound_head(page); + prep_compound_page(page, dst->order); + + init_new_hugetlb_folio(dst, page_folio(page)); + list_add(&page->lru, &dst_list); + } } - mutex_unlock(&target_hstate->resize_lock); - spin_lock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(dst, &dst_list); - /* - * Not absolutely necessary, but for consistency update max_huge_pages - * based on pool changes for the demoted page. - */ - h->max_huge_pages--; - target_hstate->max_huge_pages += - pages_per_huge_page(h) / pages_per_huge_page(target_hstate); + mutex_unlock(&dst->resize_lock); return rc; } -static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct folio *folio; + struct hstate *dst; + long rc = 0; + long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ - if (!h->demote_order) { + if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } + dst = size_to_hstate(PAGE_SIZE << src->demote_order); - for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { + LIST_HEAD(list); + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; - return demote_free_hugetlb_folio(h, folio); + + remove_hugetlb_folio(src, folio, false); + list_add(&folio->lru, &list); + + if (++nr_demoted == nr_to_demote) + break; + } + + spin_unlock_irq(&hugetlb_lock); + + rc = demote_free_hugetlb_folios(src, dst, &list); + + spin_lock_irq(&hugetlb_lock); + + list_for_each_entry_safe(folio, next, &list, lru) { + list_del(&folio->lru); + add_hugetlb_folio(src, folio, false); + + nr_demoted--; } + + if (rc < 0 || nr_demoted == nr_to_demote) + break; } /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page. + */ + src->max_huge_pages -= nr_demoted; + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); + + if (rc < 0) + return rc; + + if (nr_demoted) + return nr_demoted; + /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ @@ -4249,6 +4095,8 @@ static ssize_t demote_store(struct kobject *kobj, spin_lock_irq(&hugetlb_lock); while (nr_demote) { + long rc; + /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. @@ -4261,11 +4109,13 @@ static ssize_t demote_store(struct kobject *kobj, if (!nr_available) break; - err = demote_pool_huge_page(h, n_mask); - if (err) + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; break; + } - nr_demote--; + nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); @@ -7227,7 +7077,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, return 0; } -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) @@ -7389,7 +7239,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } -#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -7412,7 +7262,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } -#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -7510,7 +7360,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif @@ -7519,10 +7369,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -/* - * These functions are overwritable if your architecture needs its own - * behavior. - */ bool isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; |