diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 155 |
1 files changed, 103 insertions, 52 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 506eac8b38af..48aaf7b9f253 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,7 +18,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> -#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -126,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +/* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when + * freeing from pcplists in most cases, at the cost of possibly becoming stale. + * Also the migratetype set in the page does not necessarily match the pcplist + * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any + * other index - this ensures that it will be put on the correct CMA freelist. + */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily @@ -207,6 +224,9 @@ static char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", +#ifdef CONFIG_ZONE_DEVICE + "Device", +#endif }; int min_free_kbytes = 1024; @@ -246,9 +266,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - int nid = early_pfn_to_nid(pfn); - - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) return true; return false; @@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); + + mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); @@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -983,21 +1004,21 @@ static void __init __free_pages_boot_core(struct page *page, #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -/* Only safe to use early in boot when initialisation is single-threaded */ + static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; } #endif @@ -1062,7 +1083,15 @@ static void __init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } -static __initdata DECLARE_RWSEM(pgdat_init_rwsem); +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) @@ -1079,7 +1108,7 @@ static int __init deferred_init_memmap(void *data) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (first_init_pfn == ULONG_MAX) { - up_read(&pgdat_init_rwsem); + pgdat_init_report_one_done(); return 0; } @@ -1179,7 +1208,8 @@ free_range: pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); - up_read(&pgdat_init_rwsem); + + pgdat_init_report_one_done(); return 0; } @@ -1187,14 +1217,17 @@ void __init page_alloc_init_late(void) { int nid; + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); for_each_node_state(nid, N_MEMORY) { - down_read(&pgdat_init_rwsem); kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); } /* Block until all are initialised */ - down_write(&pgdat_init_rwsem); - up_write(&pgdat_init_rwsem); + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -1287,6 +1320,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -1330,12 +1367,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -1364,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); return page; } @@ -1441,7 +1481,6 @@ int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); - set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1611,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) expand(zone, page, order, current_order, area, start_migratetype); /* - * The freepage_migratetype may differ from pageblock's + * The pcppage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. + * find_suitable_fallback(). This is OK as long as it does not + * differ for MIGRATE_CMA pageblocks. Those can be used as + * fallback only via special __rmqueue_cma_fallback() function */ - set_freepage_migratetype(page, start_migratetype); + set_pcppage_migratetype(page, start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -1694,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; - if (is_migrate_cma(get_freepage_migratetype(page))) + if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1891,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold) return; migratetype = get_pfnblock_migratetype(page, pfn); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1950,6 +1988,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; + gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -1963,10 +2002,11 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - set_page_owner(page, 0, 0); + gfp_mask = get_page_owner_gfp(page); + set_page_owner(page, 0, gfp_mask); for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); - set_page_owner(page + i, 0, 0); + set_page_owner(page + i, 0, gfp_mask); } } EXPORT_SYMBOL_GPL(split_page); @@ -1996,6 +2036,8 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); + set_page_owner(page, order, __GFP_MOVABLE); + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; @@ -2007,7 +2049,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - set_page_owner(page, order, 0); + return 1UL << order; } @@ -2092,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_freepage_migratetype(page)); + get_pcppage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); @@ -2673,6 +2715,12 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { + struct oom_control oc = { + .zonelist = ac->zonelist, + .nodemask = ac->nodemask, + .gfp_mask = gfp_mask, + .order = order, + }; struct page *page; *did_some_progress = 0; @@ -2724,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) - || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: mutex_unlock(&oom_lock); @@ -3328,7 +3375,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } @@ -3467,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. - * Note this is not alloc_pages_exact_node() which allocates on a specific node, - * but is not exact. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -5043,6 +5088,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; @@ -5106,6 +5155,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -5275,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * * NOTE: pgdat should get zeroed by caller. */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn) +static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; @@ -5427,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5439,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn); + free_area_init_core(pgdat); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5450,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, */ void __init setup_nr_node_ids(void) { - unsigned int node; - unsigned int highest = 0; + unsigned int highest; - for_each_node_mask(node, node_possible_map) - highest = node; + highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); nr_node_ids = highest + 1; } #endif @@ -5975,7 +6026,7 @@ void __init mem_init_print_info(const char *str) * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * - * The per-cpu batchsize and zone watermarks are determined by present_pages. + * The per-cpu batchsize and zone watermarks are determined by managed_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This * function may optionally be used to account for unfreeable pages in the @@ -6028,7 +6079,7 @@ void __init page_alloc_init(void) } /* - * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) @@ -6072,7 +6123,7 @@ static void calculate_totalreserve_pages(void) /* * setup_per_zone_lowmem_reserve - called whenever - * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * sysctl_lowmem_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). */ |