diff options
Diffstat (limited to 'mm/hugetlb_vmemmap.c')
-rw-r--r-- | mm/hugetlb_vmemmap.c | 589 |
1 files changed, 453 insertions, 136 deletions
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1362feb3c6c9..20f414c0379f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * @@ -10,84 +10,443 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include <linux/memory.h> +#include <linux/pgtable.h> +#include <linux/bootmem_info.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" -/* - * There are a lot of struct page structures associated with each HugeTLB page. - * For tail pages, the value of compound_head is the same. So we can reuse first - * page of head page structures. We map the virtual addresses of all the pages - * of tail page structures to the head page struct, and then free these page - * frames. Therefore, we need to reserve one pages as vmemmap areas. +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. */ -#define RESERVE_VMEMMAP_NR 1U -#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) - -enum vmemmap_optimize_mode { - VMEMMAP_OPTIMIZE_OFF, - VMEMMAP_OPTIMIZE_ON, +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; }; -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *page = pmd_page(*pmd); + pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(page + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } -static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + + /* Make pte visible before pmd. See comment in pmd_install(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} -static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { - if (vmemmap_optimize_mode == to) - return; + int leaf; - if (to == VMEMMAP_OPTIMIZE_OFF) - static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - static_branch_inc(&hugetlb_optimize_vmemmap_key); - WRITE_ONCE(vmemmap_optimize_mode, to); + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. + */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } } -static int __init hugetlb_vmemmap_early_param(char *buf) +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) { - bool enable; - enum vmemmap_optimize_mode mode; + pmd_t *pmd; + unsigned long next; - if (kstrtobool(buf, &enable)) - return -EINVAL; + pmd = pmd_offset(pud, addr); + do { + int ret; - mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; - vmemmap_optimize_mode_switch(mode); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; + + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. + */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); return 0; } -early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* - * Previously discarded vmemmap pages will be allocated and remapping - * after this function returns zero. + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). */ -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. + */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. + * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. + */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. + */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); + +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); +core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); + +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { int ret; - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * The pages which the vmemmap virtual address range [@vmemmap_addr, + * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); if (!ret) { ClearHPageVmemmapOptimized(head); @@ -97,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } -static unsigned int vmemmap_optimizable_pages(struct hstate *h, - struct page *head) +/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ +static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return 0; + if (!READ_ONCE(vmemmap_optimize_enabled)) + return false; + + if (!hugetlb_vmemmap_optimizable(h)) + return false; if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { pmd_t *pmdp, pmd; @@ -144,118 +506,73 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h, * +-------------------------------------------+ */ if (PageVmemmapSelfHosted(vmemmap_page)) - return 0; + return false; } - return hugetlb_optimize_vmemmap_pages(h); + return true; } -void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. + */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; - vmemmap_pages = vmemmap_optimizable_pages(h, head); - if (!vmemmap_pages) + if (!vmemmap_should_optimize(h, head)) return; static_branch_inc(&hugetlb_optimize_vmemmap_key); - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. + * which the range [@vmemmap_start, @vmemmap_end] is mapped to. */ - if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); } -void __init hugetlb_vmemmap_init(struct hstate *h) -{ - unsigned int nr_pages = pages_per_huge_page(h); - unsigned int vmemmap_pages; - - /* - * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, - * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. - */ - BUILD_BUG_ON(__NR_USED_SUBPAGE >= - RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - - if (!is_power_of_2(sizeof(struct page))) { - pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); - static_branch_disable(&hugetlb_optimize_vmemmap_key); - return; - } - - vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; - /* - * The head page is not to be freed to buddy allocator, the other tail - * pages will map to the head page, so they can be freed. - * - * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true - * on some architectures (e.g. aarch64). See Documentation/arm64/ - * hugetlbpage.rst for more details. - */ - if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - - pr_info("can optimize %d vmemmap pages for %s\n", - h->optimize_vmemmap_pages, h->name); -} - -#ifdef CONFIG_PROC_SYSCTL -static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) -{ - int ret; - enum vmemmap_optimize_mode mode; - static DEFINE_MUTEX(sysctl_mutex); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&sysctl_mutex); - mode = vmemmap_optimize_mode; - table->data = &mode; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (write && !ret) - vmemmap_optimize_mode_switch(mode); - mutex_unlock(&sysctl_mutex); - - return ret; -} - static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", - .maxlen = sizeof(enum vmemmap_optimize_mode), + .data = &vmemmap_optimize_enabled, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_optimize_vmemmap_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .proc_handler = proc_dobool, }, { } }; -static __init int hugetlb_vmemmap_sysctls_init(void) +static int __init hugetlb_vmemmap_init(void) { - /* - * If "struct page" crosses page boundaries, the vmemmap pages cannot - * be optimized. - */ - if (is_power_of_2(sizeof(struct page))) - register_sysctl_init("vm", hugetlb_vmemmap_sysctls); - + /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ + BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + + if (IS_ENABLED(CONFIG_PROC_SYSCTL)) { + const struct hstate *h; + + for_each_hstate(h) { + if (hugetlb_vmemmap_optimizable(h)) { + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + break; + } + } + } return 0; } -late_initcall(hugetlb_vmemmap_sysctls_init); -#endif /* CONFIG_PROC_SYSCTL */ +late_initcall(hugetlb_vmemmap_init); |