Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  439
1 file changed, 264 insertions(+), 175 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 170bbf144cfa..e5486d47406e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,97 @@ typedef int __bitwise fpi_t;
 
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
-struct pagesets {
-	local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-};
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()		preempt_disable()
+#define pcpu_task_unpin()	preempt_enable()
+#else
+#define pcpu_task_pin()		migrate_disable()
+#define pcpu_task_unpin()	migrate_enable()
+#endif
+
+/*
+ * Generic helper to look up and lock a per-cpu variable with an embedded
+ * spinlock. The return value should be used with the equivalent unlock
+ * helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock(&_ret->member);					\
+	_ret;								\
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock_irqsave(&_ret->member, flags);			\
+	_ret;								\
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags)		\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	if (!spin_trylock_irqsave(&_ret->member, flags)) {		\
+		pcpu_task_unpin();					\
+		_ret = NULL;						\
+	}								\
+	_ret;								\
+})
+
+#define pcpu_spin_unlock(member, ptr)					\
+({									\
+	spin_unlock(&ptr->member);					\
+	pcpu_task_unpin();						\
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags)			\
+({									\
+	spin_unlock_irqrestore(&ptr->member, flags);			\
+	pcpu_task_unpin();						\
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr)						\
+	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags)				\
+	pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags)				\
+	pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr)						\
+	pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags)				\
+	pcpu_spin_unlock_irqrestore(lock, ptr, flags)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
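
The trylock-and-pin scheme above replaces the old pagesets local_lock: a task pins itself to a CPU, looks up that CPU's pageset, and either takes the embedded spinlock or backs out with NULL. The shape of pcpu_spin_trylock_irqsave() can be modeled in plain C11; everything here (cpu_id(), struct pcp, the atomic_bool lock) is an illustrative stand-in, not kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define NCPU 4

struct pcp {
	atomic_bool lock;	/* stands in for the new spinlock pcp->lock */
	int count;
};

static struct pcp pcp_of[NCPU];

/* Stand-in for this_cpu_ptr(): in the kernel, pcpu_task_pin() keeps the
 * task on one CPU so the lookup and the lock stay in agreement. */
static int cpu_id(void)
{
	return 0;
}

static struct pcp *pcp_trylock(void)
{
	struct pcp *p = &pcp_of[cpu_id()];	/* "pin", then look up */

	/* Like pcpu_spin_trylock_irqsave(): NULL tells the caller to fall back. */
	if (atomic_exchange_explicit(&p->lock, true, memory_order_acquire))
		return NULL;
	return p;
}

static void pcp_unlock(struct pcp *p)
{
	atomic_store_explicit(&p->lock, false, memory_order_release);
}

int main(void)
{
	struct pcp *p = pcp_trylock();

	if (p) {
		p->count++;			/* fast path: lock acquired */
		pcp_unlock(p);
	} else {
		puts("contended: caller would use the buddy lists instead");
	}
	printf("count=%d\n", pcp_of[cpu_id()].count);
	return 0;
}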
@@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 #endif
 
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
-	struct zone *zone;
-	struct work_struct work;
-};
 static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
 
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -524,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 {
 	unsigned long *bitmap;
 	unsigned long bitidx, word_bitidx;
-	unsigned long old_word, word;
+	unsigned long word;
 
 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
 	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
@@ -540,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 	flags <<= bitidx;
 
 	word = READ_ONCE(bitmap[word_bitidx]);
-	for (;;) {
-		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-		if (word == old_word)
-			break;
-		word = old_word;
-	}
+	do {
+	} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
 }
 
 void set_pageblock_migratetype(struct page *page, int migratetype)
@@ -653,7 +727,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
 		VM_BUG_ON(order != pageblock_order);
-		base = PAGE_ALLOC_COSTLY_ORDER + 1;
+		return NR_LOWORDER_PCP_LISTS;
 	}
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +741,7 @@ static inline int pindex_to_order(unsigned int pindex)
 	int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order > PAGE_ALLOC_COSTLY_ORDER)
+	if (pindex == NR_LOWORDER_PCP_LISTS)
 		order = pageblock_order;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -744,6 +818,14 @@ void prep_compound_page(struct page *page, unsigned int order)
 	prep_compound_head(page, order);
 }
 
+void destroy_large_folio(struct folio *folio)
+{
+	enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+	compound_page_dtors[dtor](&folio->page);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
@@ -785,7 +867,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
 		return false;
 
 	__SetPageGuard(page);
-	INIT_LIST_HEAD(&page->lru);
+	INIT_LIST_HEAD(&page->buddy_list);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
 	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -928,7 +1010,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add(&page->lru, &area->free_list[migratetype]);
+	list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -938,7 +1020,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add_tail(&page->lru, &area->free_list[migratetype]);
+	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -952,7 +1034,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move_tail(&page->lru, &area->free_list[migratetype]);
+	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -962,7 +1044,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 	if (page_reported(page))
 		__ClearPageReported(page);
 
-	list_del(&page->lru);
+	list_del(&page->buddy_list);
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
 	zone->free_area[order].nr_free--;
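
Among the smaller cleanups above, set_pfnblock_flags_mask() drops its hand-rolled cmpxchg() retry loop in favour of try_cmpxchg(), which updates the expected value in place on failure. C11's atomic_compare_exchange_weak() has the same contract, so the conversion can be sketched in standard C:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long word;

static void set_bits(unsigned long mask, unsigned long flags)
{
	unsigned long old = atomic_load(&word);

	/* On failure, 'old' is refreshed with the current value, exactly
	 * the by-reference update try_cmpxchg() performs on its 'old'
	 * argument, so no manual old/new shuffling is needed. */
	while (!atomic_compare_exchange_weak(&word, &old,
					     (old & ~mask) | flags))
		;	/* retry with the refreshed 'old' */
}

int main(void)
{
	atomic_store(&word, 0xf0UL);
	set_bits(0x0fUL, 0x05UL);
	printf("word=%#lx\n", atomic_load(&word));	/* prints 0xf5 */
	return 0;
}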
@@ -1296,18 +1378,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 	       PageSkipKASanPoison(page);
 }
 
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_pages(struct page *page, int numpages)
 {
 	int i;
 
 	/* s390's use of memset() could override KASAN redzones. */
 	kasan_disable_current();
-	for (i = 0; i < numpages; i++) {
-		u8 tag = page_kasan_tag(page + i);
-		page_kasan_tag_reset(page + i);
-		clear_highpage(page + i);
-		page_kasan_tag_set(page + i, tag);
-	}
+	for (i = 0; i < numpages; i++)
+		clear_highpage_kasan_tagged(page + i);
 	kasan_enable_current();
 }
 
@@ -1396,7 +1474,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			init = false;
 	}
 	if (init)
-		kernel_init_free_pages(page, 1 << order);
+		kernel_init_pages(page, 1 << order);
 
 	/*
 	 * arch_free_page() can make the page's contents inaccessible.  s390
@@ -1473,10 +1551,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	/* Ensure requested pindex is drained first. */
 	pindex = pindex - 1;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1504,11 +1579,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		do {
 			int mt;
 
-			page = list_last_entry(list, struct page, lru);
+			page = list_last_entry(list, struct page, pcp_list);
 			mt = get_pcppage_migratetype(page);
 
 			/* must delete to avoid corrupting pcp list */
-			list_del(&page->lru);
+			list_del(&page->pcp_list);
 			count -= nr_pages;
 			pcp->count -= nr_pages;
 
@@ -2442,7 +2517,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	}
 	/* If memory is still not initialized, do it now. */
 	if (init)
-		kernel_init_free_pages(page, 1 << order);
+		kernel_init_pages(page, 1 << order);
 	/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
 	if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
 		SetPageSkipKASanPoison(page);
@@ -3045,10 +3120,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
-	/*
-	 * local_lock_irq held so equivalent to spin_lock_irqsave for
-	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
-	 */
+	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -3069,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
 		 */
-		list_add_tail(&page->lru, list);
+		list_add_tail(&page->pcp_list, list);
 		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
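
The replacement comments in free_pcppages_bulk() and rmqueue_bulk() record a locking precondition rather than a lock: both functions take zone->lock plain, so the caller's IRQ-safe pcp->lock acquisition is what keeps the nested zone->lock safe against interrupt re-entry. A toy model of that invariant, with "IRQs" reduced to a thread-local flag and the rule enforced by an assert:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool irqs_disabled;

static void pcp_lock_irqsave(void)	{ irqs_disabled = true; }
static void pcp_unlock_irqrestore(void)	{ irqs_disabled = false; }

/* zone->lock is taken plain; the documented precondition makes that safe. */
static void zone_lock(void)
{
	assert(irqs_disabled);	/* lockdep would catch a violation */
}

static void free_pcppages_bulk_model(void)
{
	zone_lock();
	puts("returning pcp pages to the buddy lists");
}

int main(void)
{
	pcp_lock_irqsave();		/* caller holds IRQ-safe pcp->lock */
	free_pcppages_bulk_model();	/* so the plain zone->lock is fine */
	pcp_unlock_irqrestore();
	return 0;
}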
@@ -3092,51 +3164,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  * Called from the vmstat counter updater to drain pagesets of this
  * currently executing processor on remote nodes after they have
  * expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 #endif
 
 /*
  * Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+	if (pcp->count) {
+		unsigned long flags;
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 
 /*
  * Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages(unsigned int cpu)
 {
@@ -3149,9 +3218,6 @@ static void drain_pages(unsigned int cpu)
 
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
  */
 void drain_local_pages(struct zone *zone)
 {
@@ -3163,24 +3229,6 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
 
-static void drain_local_pages_wq(struct work_struct *work)
-{
-	struct pcpu_drain *drain;
-
-	drain = container_of(work, struct pcpu_drain, work);
-
-	/*
-	 * drain_all_pages doesn't use proper cpu hotplug protection so
-	 * we can race with cpu offline when the WQ can move this from
-	 * a cpu pinned worker to an unbound one. We can operate on a different
-	 * cpu which is alright but we also have to make sure to not move to
-	 * a different one.
-	 */
-	migrate_disable();
-	drain_local_pages(drain->zone);
-	migrate_enable();
-}
-
 /*
  * The implementation of drain_all_pages(), exposing an extra parameter to
  * drain on all cpus.
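
With pcp->lock an ordinary spinlock, a drain no longer needs to run on the CPU that owns the pageset, which is why drain_local_pages_wq() and the whole workqueue round trip disappear above. A sketch of the resulting synchronous remote drain, using illustrative stand-ins for the per-CPU pagesets rather than kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define NCPU 4

struct pcp {
	atomic_bool lock;
	int count;
};

static struct pcp pcp_of[NCPU] = {
	{ .count = 3 }, { .count = 0 }, { .count = 7 }, { .count = 1 },
};

static void drain_pages_cpu(int cpu)
{
	struct pcp *p = &pcp_of[cpu];

	/* spin_lock_irqsave(&pcp->lock, flags) in the kernel */
	while (atomic_exchange_explicit(&p->lock, true, memory_order_acquire))
		;
	if (p->count) {
		printf("cpu%d: freeing %d pages to the buddy lists\n",
		       cpu, p->count);
		p->count = 0;
	}
	atomic_store_explicit(&p->lock, false, memory_order_release);
}

int main(void)
{
	/* No INIT_WORK/queue_work_on/flush_work round trip any more:
	 * the draining CPU walks every remote pcp directly. */
	for (int cpu = 0; cpu < NCPU; cpu++)
		drain_pages_cpu(cpu);
	return 0;
}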
@@ -3202,13 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	static cpumask_t cpus_with_pcps;
 
 	/*
-	 * Make sure nobody triggers this path before mm_percpu_wq is fully
-	 * initialized.
-	 */
-	if (WARN_ON_ONCE(!mm_percpu_wq))
-		return;
-
-	/*
 	 * Do not drain if one is already in progress unless it's specific to
 	 * a zone. Such callers are primarily CMA and memory hotplug and need
 	 * the drain to be complete when the call returns.
@@ -3257,14 +3298,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	}
 
 	for_each_cpu(cpu, &cpus_with_pcps) {
-		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
-		drain->zone = zone;
-		INIT_WORK(&drain->work, drain_local_pages_wq);
-		queue_work_on(cpu, mm_percpu_wq, &drain->work);
+		if (zone)
+			drain_pages_zone(cpu, zone);
+		else
+			drain_pages(cpu);
 	}
-	for_each_cpu(cpu, &cpus_with_pcps)
-		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
 
 	mutex_unlock(&pcpu_drain_mutex);
 }
@@ -3273,8 +3311,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
@@ -3319,7 +3355,7 @@ void mark_free_pages(struct zone *zone)
 
 	for_each_migratetype_order(order, t) {
 		list_for_each_entry(page,
-				&zone->free_area[order].free_list[t], lru) {
+				&zone->free_area[order].free_list[t], buddy_list) {
 			unsigned long i;
 
 			pfn = page_to_pfn(page);
@@ -3396,19 +3432,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+				   struct page *page, int migratetype,
 				   unsigned int order)
 {
-	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
 
 	__count_vm_event(PGFREE);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
-	list_add(&page->lru, &pcp->lists[pindex]);
+	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
 	/*
@@ -3433,6 +3467,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 void free_unref_page(struct page *page, unsigned int order)
 {
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
+	struct per_cpu_pages *pcp;
+	struct zone *zone;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
@@ -3455,9 +3492,16 @@ void free_unref_page(struct page *page, unsigned int order)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+	zone = page_zone(page);
+	pcp_trylock_prepare(UP_flags);
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (pcp) {
+		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		pcp_spin_unlock_irqrestore(pcp, flags);
+	} else {
+		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+	}
+	pcp_trylock_finish(UP_flags);
 }
 
 /*
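
free_unref_page() now degrades gracefully: if the pcp trylock fails (say, against a parallel drain), the page goes straight to the buddy lists via free_one_page() rather than spinning. A compact model of that flow; the stubs and the flag handling below are hypothetical stand-ins, not kernel functions:

#include <stdbool.h>
#include <stdio.h>

/* On SMP these are no-ops; the UP build maps them to IRQ save/restore
 * because a UP spin_trylock always "succeeds". Modeled as no-ops here. */
#define pcp_trylock_prepare(flags)	do { (void)(flags); } while (0)
#define pcp_trylock_finish(flags)	do { (void)(flags); } while (0)

static bool pcp_trylock(void)	{ return true; }	/* stub: uncontended */
static void pcp_unlock(void)	{ }
static void free_to_pcp(void)	{ puts("fast path: cached on the pcp list"); }
static void free_one_page(void)	{ puts("slow path: straight to buddy"); }

static void free_unref_page(void)
{
	unsigned long flags = 0;

	pcp_trylock_prepare(flags);
	if (pcp_trylock()) {
		free_to_pcp();
		pcp_unlock();
	} else {
		free_one_page();	/* never spins on pcp->lock */
	}
	pcp_trylock_finish(flags);
}

int main(void)
{
	free_unref_page();
	return 0;
}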
@@ -3466,6 +3510,8 @@ void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp = NULL;
+	struct zone *locked_zone = NULL;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
@@ -3490,8 +3536,18 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			if (pcp)
+				pcp_spin_unlock_irqrestore(pcp, flags);
+
+			locked_zone = zone;
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3501,19 +3557,21 @@ void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+		free_unref_page_commit(zone, pcp, page, migratetype, 0);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			local_unlock_irqrestore(&pagesets.lock, flags);
+			pcp_spin_unlock_irqrestore(pcp, flags);
 			batch_count = 0;
-			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
 		}
 	}
-	local_unlock_irqrestore(&pagesets.lock, flags);
+
+	if (pcp)
+		pcp_spin_unlock_irqrestore(pcp, flags);
 }
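
Because each zone now has its own pcp lock, free_unref_page_list() keeps the lock of the current page's zone cached, switches locks only on a zone change, and periodically drops and retakes it to bound the IRQ-disabled time. The control flow, reduced to a runnable sketch with made-up zone names:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 2	/* tiny, to make the periodic relock visible */

struct zone { const char *name; };

static void pcp_lock(const struct zone *z)	{ printf("lock   %s\n", z->name); }
static void pcp_unlock(const struct zone *z)	{ printf("unlock %s\n", z->name); }

static void free_page_list(const struct zone **zones, int n)
{
	const struct zone *locked_zone = NULL;
	int batch_count = 0;

	for (int i = 0; i < n; i++) {
		const struct zone *zone = zones[i];

		/* Different zone, different pcp lock. */
		if (zone != locked_zone) {
			if (locked_zone)
				pcp_unlock(locked_zone);
			locked_zone = zone;
			pcp_lock(locked_zone);
		}

		/* ... free_unref_page_commit(zone, pcp, page, ...) ... */

		/* Bound the lock hold time on long lists. */
		if (++batch_count == SWAP_CLUSTER_MAX) {
			pcp_unlock(locked_zone);
			batch_count = 0;
			pcp_lock(locked_zone);
		}
	}
	if (locked_zone)
		pcp_unlock(locked_zone);
}

int main(void)
{
	const struct zone dma = { "DMA" }, normal = { "Normal" };
	const struct zone *pages[] = { &normal, &normal, &normal, &dma };

	free_page_list(pages, 4);
	return 0;
}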
@@ -3638,6 +3696,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 #endif
 }
 
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+			   unsigned int order, unsigned int alloc_flags,
+			   int migratetype)
+{
+	struct page *page;
+	unsigned long flags;
+
+	do {
+		page = NULL;
+		spin_lock_irqsave(&zone->lock, flags);
+		/*
+		 * order-0 request can reach here when the pcplist is skipped
+		 * due to non-CMA allocation context. HIGHATOMIC area is
+		 * reserved for high-order atomic allocation, so order-0
+		 * request should skip it.
+		 */
+		if (order > 0 && alloc_flags & ALLOC_HARDER)
+			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+		if (!page) {
+			page = __rmqueue(zone, order, migratetype, alloc_flags);
+			if (!page) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return NULL;
+			}
+		}
+		__mod_zone_freepage_state(zone, -(1 << order),
+					  get_pcppage_migratetype(page));
+		spin_unlock_irqrestore(&zone->lock, flags);
+	} while (check_new_pages(page, order));
+
+	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+	zone_statistics(preferred_zone, zone, 1);
+
+	return page;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3671,8 +3766,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 				return NULL;
 		}
 
-		page = list_first_entry(list, struct page, lru);
-		list_del(&page->lru);
+		page = list_first_entry(list, struct page, pcp_list);
+		list_del(&page->pcp_list);
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
@@ -3689,19 +3784,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 
-	local_lock_irqsave(&pagesets.lock, flags);
+	/*
+	 * spin_trylock may fail due to a parallel drain. In the future, the
+	 * trylock will also protect against IRQ reentrancy.
+	 */
+	pcp_trylock_prepare(UP_flags);
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp) {
+		pcp_trylock_finish(UP_flags);
+		return NULL;
+	}
 
 	/*
 	 * On allocation, reduce the number of pages that are batch freed.
 	 * See nr_pcp_free() where free_factor is increased for subsequent
 	 * frees.
 	 */
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+	pcp_spin_unlock_irqrestore(pcp, flags);
+	pcp_trylock_finish(UP_flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone, 1);
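
On the allocation side, a failed pcp trylock is likewise not fatal: rmqueue_pcplist() returns NULL and rmqueue() falls through to the new rmqueue_buddy() helper, which takes zone->lock the traditional, waiting way. A stub model of the two-level fallback (all names and the "contended" state are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct page { int order; };

static struct page a_pcp_page, a_buddy_page;
static bool pcp_contended = true;	/* pretend a remote drain holds it */

static struct page *rmqueue_pcplist(void)
{
	if (pcp_contended)
		return NULL;	/* trylock failed: caller must fall back */
	return &a_pcp_page;
}

static struct page *rmqueue_buddy(void)
{
	/* The spin_lock_irqsave(&zone->lock, ...) path waits instead of
	 * backing off, so it makes progress while the pcp is busy. */
	return &a_buddy_page;
}

static struct page *rmqueue(void)
{
	struct page *page = rmqueue_pcplist();

	if (page)		/* if (likely(page)) goto out; */
		return page;
	return rmqueue_buddy();
}

int main(void)
{
	printf("allocated from the %s path\n",
	       rmqueue() == &a_buddy_page ? "buddy" : "pcp");
	return 0;
}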
@@ -3718,9 +3823,14 @@ struct page *rmqueue(struct zone *preferred_zone,
 			gfp_t gfp_flags, unsigned int alloc_flags,
 			int migratetype)
 {
-	unsigned long flags;
 	struct page *page;
 
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
 	if (likely(pcp_allowed_order(order))) {
 		/*
 		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3730,53 +3840,23 @@ struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
-	/*
-	 * We most definitely don't want callers attempting to
-	 * allocate greater than order-1 page units with __GFP_NOFAIL.
-	 */
-	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
-	do {
-		page = NULL;
-		spin_lock_irqsave(&zone->lock, flags);
-		/*
-		 * order-0 request can reach here when the pcplist is skipped
-		 * due to non-CMA allocation context. HIGHATOMIC area is
-		 * reserved for high-order atomic allocation, so order-0
-		 * request should skip it.
-		 */
-		if (order > 0 && alloc_flags & ALLOC_HARDER)
-			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-		if (!page) {
-			page = __rmqueue(zone, order, migratetype, alloc_flags);
-			if (!page)
-				goto failed;
-		}
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-		spin_unlock_irqrestore(&zone->lock, flags);
-	} while (check_new_pages(page, order));
-
-	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-	zone_statistics(preferred_zone, zone, 1);
+	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+							migratetype);
 
 out:
 	/* Separate test+clear to avoid unnecessary atomics */
-	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+	if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
 	}
 
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
-
-failed:
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return NULL;
 }
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -4095,7 +4175,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
 	 */
 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
 	z = ac->preferred_zoneref;
@@ -5202,10 +5282,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 			*alloc_flags |= ALLOC_CPUSET;
 	}
 
-	fs_reclaim_acquire(gfp_mask);
-	fs_reclaim_release(gfp_mask);
-
-	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+	might_alloc(gfp_mask);
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return false;
@@ -5253,6 +5330,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 {
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 	struct zone *zone;
 	struct zoneref *z;
 	struct per_cpu_pages *pcp;
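
A smaller cleanup above replaces the open-coded fs_reclaim_acquire()/fs_reclaim_release() pair plus might_sleep_if() in prepare_alloc_pages() with might_alloc(). A userspace model of what the combined annotation checks, with the atomic-context state reduced to a boolean and the GFP value illustrative only:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define __GFP_DIRECT_RECLAIM 0x400u	/* illustrative value */

static bool in_atomic_context;

static void might_sleep_if(bool cond)
{
	if (cond)
		assert(!in_atomic_context);	/* the kernel would splat */
}

static void might_alloc(unsigned int gfp_mask)
{
	/* fs_reclaim_acquire()/fs_reclaim_release() would also teach
	 * lockdep about a reclaim dependency here; elided in the model. */
	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
}

int main(void)
{
	in_atomic_context = false;
	might_alloc(__GFP_DIRECT_RECLAIM);	/* sleeping allowed: passes */
	puts("allocation-context checks passed");
	return 0;
}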
@@ -5333,11 +5411,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	if (unlikely(!zone))
 		goto failed;
 
+	/* Is a parallel drain in progress? */
+	pcp_trylock_prepare(UP_flags);
+	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+	if (!pcp)
+		goto failed_irq;
+
 	/* Attempt the batch allocation */
-	local_lock_irqsave(&pagesets.lock, flags);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
-
 	while (nr_populated < nr_pages) {
 
 		/* Skip existing pages */
@@ -5350,8 +5431,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 								pcp, pcp_list);
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
-			if (!nr_account)
+			if (!nr_account) {
+				pcp_spin_unlock_irqrestore(pcp, flags);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5364,7 +5447,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+	pcp_spin_unlock_irqrestore(pcp, flags);
+	pcp_trylock_finish(UP_flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
 	zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5373,7 +5457,7 @@ out:
 	return nr_populated;
 
 failed_irq:
-	local_unlock_irqrestore(&pagesets.lock, flags);
+	pcp_trylock_finish(UP_flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5804,14 +5888,14 @@ long si_mem_available(void)
 
 	/*
 	 * Estimate the amount of memory available for userspace allocations,
-	 * without causing swapping.
+	 * without causing swapping or OOM.
 	 */
 	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
 
 	/*
 	 * Not all the page cache can be freed, otherwise the system will
-	 * start swapping. Assume at least half of the page cache, or the
-	 * low watermark worth of cache, needs to stay.
+	 * start swapping or thrashing. Assume at least half of the page
+	 * cache, or the low watermark worth of cache, needs to stay.
 	 */
 	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
 	pagecache -= min(pagecache / 2, wmark_low);
@@ -5939,7 +6023,7 @@ static void show_migration_types(unsigned char type)
 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
 	unsigned long free_pcp = 0;
-	int cpu;
+	int cpu, nid;
 	struct zone *zone;
 	pg_data_t *pgdat;
@@ -6127,7 +6211,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		printk(KERN_CONT "= %lukB\n", K(total));
 	}
 
-	hugetlb_show_meminfo();
+	for_each_online_node(nid) {
+		if (show_mem_node_skip(filter, nid, nodemask))
+			continue;
+		hugetlb_show_meminfo_node(nid);
+	}
 
 	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
@@ -7013,6 +7101,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
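
Finally, per_cpu_pages_init() gains the spin_lock_init() for the new pcp->lock, and __alloc_pages_bulk() amortizes a single trylock over the whole batch, falling back to one ordinary allocation if the lock is contended. The amortization in miniature (all names illustrative, the lock a stub):

#include <stdbool.h>
#include <stdio.h>

static int pcp_count = 8;	/* pages sitting on this CPU's pcp lists */

static bool pcp_trylock(void)	{ return true; }	/* stub: uncontended */
static void pcp_unlock(void)	{ }

static int alloc_pages_bulk(int nr_pages)
{
	int nr_account = 0;

	if (!pcp_trylock())
		return 0;	/* goto failed: one __alloc_pages() call */

	/* One lock/unlock pair covers the entire batch. */
	while (nr_account < nr_pages && pcp_count > 0) {
		pcp_count--;
		nr_account++;
	}
	pcp_unlock();
	return nr_account;
}

int main(void)
{
	printf("allocated %d pages under one pcp lock\n",
	       alloc_pages_bulk(4));
	return 0;
}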
