diff options
Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 205 | 
1 files changed, 119 insertions, 86 deletions
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df5ef95..63358d9f9aa9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@  #include <linux/vmalloc.h>  #include <linux/vmstat.h>  #include <linux/mempolicy.h> +#include <linux/memremap.h>  #include <linux/stop_machine.h>  #include <linux/sort.h>  #include <linux/pfn.h> @@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);  unsigned long totalram_pages __read_mostly;  unsigned long totalreserve_pages __read_mostly;  unsigned long totalcma_pages __read_mostly; -/* - * When calculating the number of globally allowed dirty pages, there - * is a certain number of per-zone reserves that should not be - * considered dirtyable memory.  This is the sum of those reserves - * over all existing zones that contribute dirtyable memory. - */ -unsigned long dirty_balance_reserve __read_mostly;  int percpu_pagelist_fraction;  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -229,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {  #endif  }; -static void free_compound_page(struct page *page);  compound_page_dtor * const compound_page_dtors[] = {  	NULL,  	free_compound_page,  #ifdef CONFIG_HUGETLB_PAGE  	free_huge_page,  #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +	free_transhuge_page, +#endif  };  int min_free_kbytes = 1024; @@ -457,7 +453,7 @@ out:   * This usage means that zero-order pages may not be compound.   */ -static void free_compound_page(struct page *page) +void free_compound_page(struct page *page)  {  	__free_pages_ok(page, compound_order(page));  } @@ -473,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)  	for (i = 1; i < nr_pages; i++) {  		struct page *p = page + i;  		set_page_count(p, 0); +		p->mapping = TAIL_MAPPING;  		set_compound_head(p, page);  	} +	atomic_set(compound_mapcount_ptr(page), -1);  }  #ifdef CONFIG_DEBUG_PAGEALLOC @@ -739,7 +737,7 @@ static inline int free_pages_check(struct page *page)  	const char *bad_reason = NULL;  	unsigned long bad_flags = 0; -	if (unlikely(page_mapcount(page))) +	if (unlikely(atomic_read(&page->_mapcount) != -1))  		bad_reason = "nonzero mapcount";  	if (unlikely(page->mapping != NULL))  		bad_reason = "non-NULL mapping"; @@ -812,7 +810,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,  		do {  			int mt;	/* migratetype of the to-be-freed page */ -			page = list_entry(list->prev, struct page, lru); +			page = list_last_entry(list, struct page, lru);  			/* must delete as __free_one_page list manipulates */  			list_del(&page->lru); @@ -863,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)  		ret = 0;  		goto out;  	} +	switch (page - head_page) { +	case 1: +		/* the first tail page: ->mapping is compound_mapcount() */ +		if (unlikely(compound_mapcount(page))) { +			bad_page(page, "nonzero compound_mapcount", 0); +			goto out; +		} +		break; +	case 2: +		/* +		 * the second tail page: ->mapping is +		 * page_deferred_list().next -- ignore value. +		 */ +		break; +	default: +		if (page->mapping != TAIL_MAPPING) { +			bad_page(page, "corrupted mapping in tail page", 0); +			goto out; +		} +		break; +	}  	if (unlikely(!PageTail(page))) {  		bad_page(page, "PageTail not set", 0);  		goto out; @@ -873,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)  	}  	ret = 0;  out: +	page->mapping = NULL;  	clear_compound_head(page);  	return ret;  } @@ -1336,7 +1356,7 @@ static inline int check_new_page(struct page *page)  	const char *bad_reason = NULL;  	unsigned long bad_flags = 0; -	if (unlikely(page_mapcount(page))) +	if (unlikely(atomic_read(&page->_mapcount) != -1))  		bad_reason = "nonzero mapcount";  	if (unlikely(page->mapping != NULL))  		bad_reason = "non-NULL mapping"; @@ -1417,11 +1437,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  	/* Find a page of the appropriate size in the preferred list */  	for (current_order = order; current_order < MAX_ORDER; ++current_order) {  		area = &(zone->free_area[current_order]); -		if (list_empty(&area->free_list[migratetype])) -			continue; - -		page = list_entry(area->free_list[migratetype].next, +		page = list_first_entry_or_null(&area->free_list[migratetype],  							struct page, lru); +		if (!page) +			continue;  		list_del(&page->lru);  		rmv_page_order(page);  		area->nr_free--; @@ -1700,12 +1719,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)  		for (order = 0; order < MAX_ORDER; order++) {  			struct free_area *area = &(zone->free_area[order]); -			if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) +			page = list_first_entry_or_null( +					&area->free_list[MIGRATE_HIGHATOMIC], +					struct page, lru); +			if (!page)  				continue; -			page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, -						struct page, lru); -  			/*  			 * It should never happen but changes to locking could  			 * inadvertently allow a per-cpu drain to add pages @@ -1753,7 +1772,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)  		if (fallback_mt == -1)  			continue; -		page = list_entry(area->free_list[fallback_mt].next, +		page = list_first_entry(&area->free_list[fallback_mt],  						struct page, lru);  		if (can_steal)  			steal_suitable_fallback(zone, page, start_migratetype); @@ -1788,7 +1807,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)   * Call me with the zone->lock already held.   */  static struct page *__rmqueue(struct zone *zone, unsigned int order, -				int migratetype, gfp_t gfp_flags) +				int migratetype)  {  	struct page *page; @@ -1818,7 +1837,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,  	spin_lock(&zone->lock);  	for (i = 0; i < count; ++i) { -		struct page *page = __rmqueue(zone, order, migratetype, 0); +		struct page *page = __rmqueue(zone, order, migratetype);  		if (unlikely(page == NULL))  			break; @@ -1988,7 +2007,7 @@ void mark_free_pages(struct zone *zone)  	unsigned long pfn, max_zone_pfn;  	unsigned long flags;  	unsigned int order, t; -	struct list_head *curr; +	struct page *page;  	if (zone_is_empty(zone))  		return; @@ -1998,17 +2017,17 @@ void mark_free_pages(struct zone *zone)  	max_zone_pfn = zone_end_pfn(zone);  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)  		if (pfn_valid(pfn)) { -			struct page *page = pfn_to_page(pfn); - +			page = pfn_to_page(pfn);  			if (!swsusp_page_is_forbidden(page))  				swsusp_unset_page_free(page);  		}  	for_each_migratetype_order(order, t) { -		list_for_each(curr, &zone->free_area[order].free_list[t]) { +		list_for_each_entry(page, +				&zone->free_area[order].free_list[t], lru) {  			unsigned long i; -			pfn = page_to_pfn(list_entry(curr, struct page, lru)); +			pfn = page_to_pfn(page);  			for (i = 0; i < (1UL << order); i++)  				swsusp_set_page_free(pfn_to_page(pfn + i));  		} @@ -2212,9 +2231,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,  		}  		if (cold) -			page = list_entry(list->prev, struct page, lru); +			page = list_last_entry(list, struct page, lru);  		else -			page = list_entry(list->next, struct page, lru); +			page = list_first_entry(list, struct page, lru);  		list_del(&page->lru);  		pcp->count--; @@ -2241,7 +2260,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,  				trace_mm_page_alloc_zone_locked(page, order, migratetype);  		}  		if (!page) -			page = __rmqueue(zone, order, migratetype, gfp_flags); +			page = __rmqueue(zone, order, migratetype);  		spin_unlock(&zone->lock);  		if (!page)  			goto failed; @@ -2740,8 +2759,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  			goto out;  	}  	/* Exhausted what can be done so it's blamo time */ -	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) +	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {  		*did_some_progress = 1; + +		if (gfp_mask & __GFP_NOFAIL) { +			page = get_page_from_freelist(gfp_mask, order, +					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); +			/* +			 * fallback to ignore cpuset restriction if our nodes +			 * are depleted +			 */ +			if (!page) +				page = get_page_from_freelist(gfp_mask, order, +					ALLOC_NO_WATERMARKS, ac); +		} +	}  out:  	mutex_unlock(&oom_lock);  	return page; @@ -2876,28 +2908,6 @@ retry:  	return page;  } -/* - * This is called in the allocator slow-path if the allocation request is of - * sufficient urgency to ignore watermarks and take other desperate measures - */ -static inline struct page * -__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, -				const struct alloc_context *ac) -{ -	struct page *page; - -	do { -		page = get_page_from_freelist(gfp_mask, order, -						ALLOC_NO_WATERMARKS, ac); - -		if (!page && gfp_mask & __GFP_NOFAIL) -			wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, -									HZ/50); -	} while (!page && (gfp_mask & __GFP_NOFAIL)); - -	return page; -} -  static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)  {  	struct zoneref *z; @@ -3042,28 +3052,36 @@ retry:  		 * allocations are system rather than user orientated  		 */  		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - -		page = __alloc_pages_high_priority(gfp_mask, order, ac); - -		if (page) { +		page = get_page_from_freelist(gfp_mask, order, +						ALLOC_NO_WATERMARKS, ac); +		if (page)  			goto got_pg; -		}  	}  	/* Caller is not willing to reclaim, we can't balance anything */  	if (!can_direct_reclaim) {  		/* -		 * All existing users of the deprecated __GFP_NOFAIL are -		 * blockable, so warn of any new users that actually allow this -		 * type of allocation to fail. +		 * All existing users of the __GFP_NOFAIL are blockable, so warn +		 * of any new users that actually allow this type of allocation +		 * to fail.  		 */  		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);  		goto nopage;  	}  	/* Avoid recursion of direct reclaim */ -	if (current->flags & PF_MEMALLOC) +	if (current->flags & PF_MEMALLOC) { +		/* +		 * __GFP_NOFAIL request from this context is rather bizarre +		 * because we cannot reclaim anything and only can loop waiting +		 * for somebody to do a work for us. +		 */ +		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { +			cond_resched(); +			goto retry; +		}  		goto nopage; +	}  	/* Avoid allocations with no watermarks from looping endlessly */  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) @@ -3402,7 +3420,8 @@ EXPORT_SYMBOL(__free_page_frag);  /*   * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup. + * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is + * equivalent to alloc_pages.   *   * It should be used when the caller would like to use kmalloc, but since the   * allocation is large, it has to fall back to the page allocator. @@ -4147,8 +4166,7 @@ static void set_zonelist_order(void)  static void build_zonelists(pg_data_t *pgdat)  { -	int j, node, load; -	enum zone_type i; +	int i, node, load;  	nodemask_t used_mask;  	int local_node, prev_node;  	struct zonelist *zonelist; @@ -4168,7 +4186,7 @@ static void build_zonelists(pg_data_t *pgdat)  	nodes_clear(used_mask);  	memset(node_order, 0, sizeof(node_order)); -	j = 0; +	i = 0;  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {  		/* @@ -4185,12 +4203,12 @@ static void build_zonelists(pg_data_t *pgdat)  		if (order == ZONELIST_ORDER_NODE)  			build_zonelists_in_node_order(pgdat, node);  		else -			node_order[j++] = node;	/* remember order */ +			node_order[i++] = node;	/* remember order */  	}  	if (order == ZONELIST_ORDER_ZONE) {  		/* calculate node order -- i.e., DMA last! */ -		build_zonelists_in_zone_order(pgdat, j); +		build_zonelists_in_zone_order(pgdat, i);  	}  	build_thisnode_zonelists(pgdat); @@ -4468,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		unsigned long start_pfn, enum memmap_context context)  { -	pg_data_t *pgdat = NODE_DATA(nid); +	struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));  	unsigned long end_pfn = start_pfn + size; +	pg_data_t *pgdat = NODE_DATA(nid);  	unsigned long pfn; -	struct zone *z;  	unsigned long nr_initialised = 0;  	if (highest_memmap_pfn < end_pfn - 1)  		highest_memmap_pfn = end_pfn - 1; -	z = &pgdat->node_zones[zone]; +	/* +	 * Honor reservation requested by the driver for this ZONE_DEVICE +	 * memory +	 */ +	if (altmap && start_pfn == altmap->base_pfn) +		start_pfn += altmap->reserve; +  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {  		/*  		 * There can be holes in boot-time mem_map[]s @@ -5956,20 +5980,12 @@ static void calculate_totalreserve_pages(void)  			if (max > zone->managed_pages)  				max = zone->managed_pages; + +			zone->totalreserve_pages = max; +  			reserve_pages += max; -			/* -			 * Lowmem reserves are not available to -			 * GFP_HIGHUSER page cache allocations and -			 * kswapd tries to balance zones to their high -			 * watermark.  As a result, neither should be -			 * regarded as dirtyable memory, to prevent a -			 * situation where reclaim has to clean pages -			 * in order to balance the zones. -			 */ -			zone->dirty_balance_reserve = max;  		}  	} -	dirty_balance_reserve = reserve_pages;  	totalreserve_pages = reserve_pages;  } @@ -6724,8 +6740,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,  	if (ret)  		return ret; +	/* +	 * In case of -EBUSY, we'd like to know which page causes problem. +	 * So, just fall through. We will check it in test_pages_isolated(). +	 */  	ret = __alloc_contig_migrate_range(&cc, start, end); -	if (ret) +	if (ret && ret != -EBUSY)  		goto done;  	/* @@ -6752,12 +6772,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,  	outer_start = start;  	while (!PageBuddy(pfn_to_page(outer_start))) {  		if (++order >= MAX_ORDER) { -			ret = -EBUSY; -			goto done; +			outer_start = start; +			break;  		}  		outer_start &= ~0UL << order;  	} +	if (outer_start != start) { +		order = page_order(pfn_to_page(outer_start)); + +		/* +		 * outer_start page could be small order buddy page and +		 * it doesn't include start page. Adjust outer_start +		 * in this case to report failed page properly +		 * on tracepoint in test_pages_isolated() +		 */ +		if (outer_start + (1UL << order) <= start) +			outer_start = start; +	} +  	/* Make sure the range is really isolated. */  	if (test_pages_isolated(outer_start, end, false)) {  		pr_info("%s: [%lx, %lx) PFNs busy\n", | 
