Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 317
1 file changed, 198 insertions, 119 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e3bc949ebcc..3334a769eb91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -50,7 +50,6 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/page_ext.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -136,6 +135,55 @@ unsigned long totalcma_pages __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+EXPORT_SYMBOL(init_on_alloc);
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_free);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_free);
+#endif
+EXPORT_SYMBOL(init_on_free);
+
+static int __init early_init_on_alloc(char *buf)
+{
+	int ret;
+	bool bool_result;
+
+	if (!buf)
+		return -EINVAL;
+	ret = kstrtobool(buf, &bool_result);
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
+	if (bool_result)
+		static_branch_enable(&init_on_alloc);
+	else
+		static_branch_disable(&init_on_alloc);
+	return ret;
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static int __init early_init_on_free(char *buf)
+{
+	int ret;
+	bool bool_result;
+
+	if (!buf)
+		return -EINVAL;
+	ret = kstrtobool(buf, &bool_result);
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
+	if (bool_result)
+		static_branch_enable(&init_on_free);
+	else
+		static_branch_disable(&init_on_free);
+	return ret;
+}
+early_param("init_on_free", early_init_on_free);
+
 /*
  * A cached value of the page's pageblock's migratetype, used when the page is
@@ -224,8 +272,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
 	[ZONE_MOVABLE] = 0,
 };
 
-EXPORT_SYMBOL(totalram_pages);
-
 static char * const zone_names[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
 	"DMA",
@@ -404,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
 					unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
-	return __pfn_to_section(pfn)->pageblock_flags;
+	return section_to_usemap(__pfn_to_section(pfn));
 #else
 	return page_zone(page)->pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
@@ -624,6 +670,7 @@ out:
 
 void free_compound_page(struct page *page)
 {
+	mem_cgroup_uncharge(page);
 	__free_pages_ok(page, compound_order(page));
 }
 
@@ -646,30 +693,29 @@ void prep_compound_page(struct page *page, unsigned int order)
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly
-			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+#else
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+#endif
 EXPORT_SYMBOL(_debug_pagealloc_enabled);
-bool _debug_guardpage_enabled __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
 
 static int __init early_debug_pagealloc(char *buf)
 {
-	if (!buf)
-		return -EINVAL;
-	return kstrtobool(buf, &_debug_pagealloc_enabled);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
+	bool enable = false;
 
-static bool need_debug_guardpage(void)
-{
-	/* If we don't use debug_pagealloc, we don't need guard page */
-	if (!debug_pagealloc_enabled())
-		return false;
+	if (kstrtobool(buf, &enable))
+		return -EINVAL;
 
-	if (!debug_guardpage_minorder())
-		return false;
+	if (enable)
+		static_branch_enable(&_debug_pagealloc_enabled);
 
-	return true;
+	return 0;
 }
+early_param("debug_pagealloc", early_debug_pagealloc);
 
 static void init_debug_guardpage(void)
 {
@@ -679,14 +725,9 @@ static void init_debug_guardpage(void)
 	if (!debug_guardpage_minorder())
 		return;
 
-	_debug_guardpage_enabled = true;
+	static_branch_enable(&_debug_guardpage_enabled);
 }
 
-struct page_ext_operations debug_guardpage_ops = {
-	.need = need_debug_guardpage,
-	.init = init_debug_guardpage,
-};
-
 static int __init debug_guardpage_minorder_setup(char *buf)
 {
 	unsigned long res;
@@ -704,20 +745,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
 static inline bool set_page_guard(struct zone *zone, struct page *page,
 				unsigned int order, int migratetype)
 {
-	struct page_ext *page_ext;
-
 	if (!debug_guardpage_enabled())
 		return false;
 
 	if (order >= debug_guardpage_minorder())
 		return false;
 
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return false;
-
-	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
-
+	__SetPageGuard(page);
 	INIT_LIST_HEAD(&page->lru);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
@@ -729,23 +763,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
 static inline void clear_page_guard(struct zone *zone, struct page *page,
 				unsigned int order, int migratetype)
 {
-	struct page_ext *page_ext;
-
 	if (!debug_guardpage_enabled())
 		return;
 
-	page_ext = lookup_page_ext(page);
-	if (unlikely(!page_ext))
-		return;
-
-	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+	__ClearPageGuard(page);
 
 	set_page_private(page, 0);
 	if (!is_migrate_isolate(migratetype))
 		__mod_zone_freepage_state(zone, (1 << order), migratetype);
 }
 #else
-struct page_ext_operations debug_guardpage_ops;
 static inline bool set_page_guard(struct zone *zone, struct page *page,
 			unsigned int order, int migratetype) { return false; }
 static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -1090,6 +1117,14 @@ out:
 	return ret;
 }
 
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+	int i;
+
+	for (i = 0; i < numpages; i++)
+		clear_highpage(page + i);
+}
+
 static __always_inline bool free_pages_prepare(struct page *page,
 					unsigned int order, bool check_free)
 {
@@ -1141,6 +1176,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
 					   PAGE_SIZE << order);
 	}
 	arch_free_page(page, order);
+	if (want_init_on_free())
+		kernel_init_free_pages(page, 1 << order);
+
 	kernel_poison_pages(page, 1 << order, 0);
 	if (debug_pagealloc_enabled())
 		kernel_map_pages(page, 1 << order, 0);
@@ -1151,19 +1189,36 @@ static __always_inline bool free_pages_prepare(struct page *page,
 }
 
 #ifdef CONFIG_DEBUG_VM
-static inline bool free_pcp_prepare(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
+ * moved from pcp lists to free lists.
+ */
+static bool free_pcp_prepare(struct page *page)
 {
 	return free_pages_prepare(page, 0, true);
 }
 
-static inline bool bulkfree_pcp_prepare(struct page *page)
+static bool bulkfree_pcp_prepare(struct page *page)
 {
-	return false;
+	if (debug_pagealloc_enabled())
+		return free_pages_check(page);
+	else
+		return false;
 }
 #else
+/*
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
+ * moving from pcp lists to free list in order to reduce overhead. With
+ * debug_pagealloc enabled, they are checked also immediately when being freed
+ * to the pcp lists.
+ */
 static bool free_pcp_prepare(struct page *page)
 {
-	return free_pages_prepare(page, 0, false);
+	if (debug_pagealloc_enabled())
+		return free_pages_prepare(page, 0, true);
+	else
+		return free_pages_prepare(page, 0, false);
 }
 
 static bool bulkfree_pcp_prepare(struct page *page)
@@ -1904,6 +1959,10 @@ void __init page_alloc_init_late(void)
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	init_debug_guardpage();
+#endif
 }
 
 #ifdef CONFIG_CMA
@@ -2021,28 +2080,44 @@ static inline int check_new_page(struct page *page)
 
 static inline bool free_pages_prezeroed(void)
 {
-	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled();
+	return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+		page_poisoning_enabled()) || want_init_on_free();
 }
 
 #ifdef CONFIG_DEBUG_VM
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
+ * also checked when pcp lists are refilled from the free lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
 {
-	return false;
+	if (debug_pagealloc_enabled())
+		return check_new_page(page);
+	else
+		return false;
 }
 
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
 {
 	return check_new_page(page);
 }
 #else
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
+ * enabled, they are also checked when being allocated from the pcp lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
 {
 	return check_new_page(page);
 }
 
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
 {
-	return false;
+	if (debug_pagealloc_enabled())
+		return check_new_page(page);
+	else
+		return false;
 }
 #endif /* CONFIG_DEBUG_VM */
@@ -2076,13 +2151,10 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 							unsigned int alloc_flags)
 {
-	int i;
-
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
-		for (i = 0; i < (1 << order); i++)
-			clear_highpage(page + i);
+	if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
+		kernel_init_free_pages(page, 1 << order);
 
 	if (order && (gfp_flags & __GFP_COMP))
 		prep_compound_page(page, order);
@@ -2167,27 +2239,12 @@ static int move_freepages(struct zone *zone,
 	unsigned int order;
 	int pages_moved = 0;
 
-#ifndef CONFIG_HOLES_IN_ZONE
-	/*
-	 * page_zone is not safe to call in this context when
-	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
-	 * anyway as we check zone boundaries in move_freepages_block().
-	 * Remove at a later date when no bug reports exist related to
-	 * grouping pages by mobility
-	 */
-	VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
-	          pfn_valid(page_to_pfn(end_page)) &&
-	          page_zone(start_page) != page_zone(end_page));
-#endif
 	for (page = start_page; page <= end_page;) {
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
 			continue;
 		}
 
-		/* Make sure we are not inadvertently changing nodes */
-		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
 		if (!PageBuddy(page)) {
 			/*
 			 * We assume that pages that could be isolated for
@@ -2202,6 +2259,10 @@ static int move_freepages(struct zone *zone,
 			continue;
 		}
 
+		/* Make sure we are not inadvertently changing nodes */
+		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
 		order = page_order(page);
 		move_to_free_area(page, &zone->free_area[order], migratetype);
 		page += 1 << order;
@@ -3451,7 +3512,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
-				RECLAIM_DISTANCE;
+				node_reclaim_distance;
 }
 #else	/* CONFIG_NUMA */
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -3895,14 +3956,22 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 		goto check_priority;
 
 	/*
+	 * compaction was skipped because there are not enough order-0 pages
+	 * to work with, so we retry only if it looks like reclaim can help.
+	 */
+	if (compaction_needs_reclaim(compact_result)) {
+		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+		goto out;
+	}
+
+	/*
 	 * make sure the compaction wasn't deferred or didn't bail out early
 	 * due to locks contention before we declare that we should give up.
-	 * But do not retry if the given zonelist is not suitable for
-	 * compaction.
+	 * But the next retry should use a higher priority if allowed, so
+	 * we don't just keep bailing out endlessly.
 	 */
 	if (compaction_withdrawn(compact_result)) {
-		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
-		goto out;
+		goto check_priority;
 	}
 
 	/*
@@ -4031,7 +4100,6 @@ static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 					const struct alloc_context *ac)
 {
-	struct reclaim_state reclaim_state;
 	int progress;
 	unsigned int noreclaim_flag;
 	unsigned long pflags;
@@ -4043,13 +4111,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	psi_memstall_enter(&pflags);
 	fs_reclaim_acquire(gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	reclaim_state.reclaimed_slab = 0;
-	current->reclaim_state = &reclaim_state;
 
 	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
 								ac->nodemask);
 
-	current->reclaim_state = NULL;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(gfp_mask);
 	psi_memstall_leave(&pflags);
@@ -5854,11 +5919,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
 {
 	unsigned long pfn, end_pfn = start_pfn + size;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
 
-	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+	if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
 		return;
 
 	/*
@@ -5866,9 +5932,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 	 * of the pages reserved for the memmap, so we can just jump to
 	 * the end of that region and start processing the device pages.
 	 */
-	if (pgmap->altmap_valid) {
-		struct vmem_altmap *altmap = &pgmap->altmap;
-
+	if (altmap) {
 		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
 		size = end_pfn - start_pfn;
 	}
@@ -5888,12 +5952,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		__SetPageReserved(page);
 
 		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer and hmm_data. It is a bug if a ZONE_DEVICE
-		 * page is ever freed or placed on a driver-private list.
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+		 * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+		 * ever freed or placed on a driver-private list.
 		 */
 		page->pgmap = pgmap;
-		page->hmm_data = 0;
+		page->zone_device_data = NULL;
 
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -5908,7 +5972,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		 * pfn out of zone.
 		 *
 		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
-		 * because this is done early in sparse_add_one_section
+		 * because this is done early in section_activate()
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -5916,7 +5980,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		}
 	}
 
-	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+	pr_info("%s initialised %lu pages in %ums\n", __func__,
 		size, jiffies_to_msecs(jiffies - start));
 }
 
@@ -6583,9 +6647,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void pgdat_init_split_queue(struct pglist_data *pgdat)
 {
-	spin_lock_init(&pgdat->split_queue_lock);
-	INIT_LIST_HEAD(&pgdat->split_queue);
-	pgdat->split_queue_len = 0;
+	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+	spin_lock_init(&ds_queue->split_queue_lock);
+	INIT_LIST_HEAD(&ds_queue->split_queue);
+	ds_queue->split_queue_len = 0;
 }
 #else
 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -7285,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			(u64)zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
-	/* Print out the early node map */
+	/*
+	 * Print out the early node map, and initialize the
+	 * subsection-map relative to active online memory ranges to
+	 * enable future "sub-section" extensions of the memory map.
+	 */
 	pr_info("Early memory node ranges\n");
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 		pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
 			(u64)start_pfn << PAGE_SHIFT,
 			((u64)end_pfn << PAGE_SHIFT) - 1);
+		subsection_map_init(start_pfn, end_pfn - start_pfn);
+	}
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
@@ -7520,10 +7592,28 @@ static int page_alloc_cpu_dead(unsigned int cpu)
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+int hashdist = HASHDIST_DEFAULT;
+
+static int __init set_hashdist(char *str)
+{
+	if (!str)
+		return 0;
+	hashdist = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
 void __init page_alloc_init(void)
 {
 	int ret;
 
+#ifdef CONFIG_NUMA
+	if (num_node_state(N_MEMORY) == 1)
+		hashdist = 0;
+#endif
+
 	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
 					"mm/page_alloc:dead", NULL,
 					page_alloc_cpu_dead);
@@ -7908,19 +7998,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_NUMA
-int hashdist = HASHDIST_DEFAULT;
-
-static int __init set_hashdist(char *str)
-{
-	if (!str)
-		return 0;
-	hashdist = simple_strtoul(str, &str, 0);
-	return 1;
-}
-__setup("hashdist=", set_hashdist);
-#endif
-
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but
@@ -7967,6 +8044,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	unsigned long log2qty, size;
 	void *table = NULL;
 	gfp_t gfp_flags;
+	bool virt;
 
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
@@ -8023,6 +8101,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
 	do {
+		virt = false;
 		size = bucketsize << log2qty;
 		if (flags & HASH_EARLY) {
 			if (flags & HASH_ZERO)
@@ -8030,26 +8109,26 @@ void *__init alloc_large_system_hash(const char *tablename,
 			else
 				table = memblock_alloc_raw(size, SMP_CACHE_BYTES);
-		} else if (hashdist) {
+		} else if (get_order(size) >= MAX_ORDER || hashdist) {
 			table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+			virt = true;
 		} else {
 			/*
 			 * If bucketsize is not a power-of-two, we may free
 			 * some pages at the end of hash table which
 			 * alloc_pages_exact() automatically does
 			 */
-			if (get_order(size) < MAX_ORDER) {
-				table = alloc_pages_exact(size, gfp_flags);
-				kmemleak_alloc(table, size, 1, gfp_flags);
-			}
+			table = alloc_pages_exact(size, gfp_flags);
+			kmemleak_alloc(table, size, 1, gfp_flags);
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
 
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
 
-	pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
-		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
+	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+		virt ? "vmalloc" : "linear");
 
 	if (_hash_shift)
 		*_hash_shift = log2qty;
@@ -8128,7 +8207,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 			if (!hugepage_migration_supported(page_hstate(head)))
 				goto unmovable;
 
-			skip_pages = (1 << compound_order(head)) - (page - head);
+			skip_pages = compound_nr(head) - (page - head);
 			iter += skip_pages - 1;
 			continue;
 		}