diff options
Diffstat (limited to 'mm/compaction.c')
-rw-r--r-- | mm/compaction.c | 618 |
1 files changed, 467 insertions, 151 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 585de54dbe8c..79bfe0e06907 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> */ +#include <linux/cpu.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/compaction.h> @@ -17,6 +18,8 @@ #include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -39,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) +#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) +#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -71,49 +79,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ @@ -200,7 +165,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + pageblock_start_pfn(zone_end_pfn(zone) - 1); } /* @@ -475,25 +441,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated) + break; + total_isolated += isolated; + cc->nr_freepages += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); page++; } - - /* If a page was split, advance to the end of it */ - if (isolated) { - cc->nr_freepages += isolated; - if (!strict && - cc->nr_migratepages <= cc->nr_freepages) { - blockpfn += isolated; - break; - } - - blockpfn += isolated - 1; - cursor += isolated - 1; - continue; + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; isolate_fail: if (strict) @@ -503,6 +467,9 @@ isolate_fail: } + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + /* * There is a tiny chance that we have read bogus compound_order(), * so be careful to not go outside of the pageblock. @@ -524,9 +491,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - if (locked) - spin_unlock_irqrestore(&cc->zone->lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); @@ -554,13 +518,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -573,11 +541,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -666,12 +636,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; - struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; /* * Ensure that there are not too many pages isolated from the LRU @@ -692,10 +663,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (compact_should_abort(cc)) return 0; + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { bool is_lru; + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -707,7 +705,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, break; if (!pfn_valid_within(low_pfn)) - continue; + goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -762,11 +760,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (likely(comp_order < MAX_ORDER)) low_pfn += (1UL << comp_order) - 1; - continue; + goto isolate_fail; } if (!is_lru) - continue; + goto isolate_fail; /* * Migration will fail if an anonymous page is pinned in memory, @@ -775,7 +773,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!page_mapping(page) && page_count(page) > page_mapcount(page)) - continue; + goto isolate_fail; /* If we already hold the lock, we can skip some rechecking */ if (!locked) { @@ -786,7 +784,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) - continue; + goto isolate_fail; /* * Page become compound since the non-locked check, @@ -795,7 +793,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page))) { low_pfn += (1UL << compound_order(page)) - 1; - continue; + goto isolate_fail; } } @@ -803,7 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) - continue; + goto isolate_fail; VM_BUG_ON_PAGE(PageCompound(page), page); @@ -811,15 +809,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - list_add(&page->lru, migratelist); + list_add(&page->lru, &cc->migratepages); cc->nr_migratepages++; nr_isolated++; + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that was isolated and likely be + * then freed by migration. + */ + if (!cc->last_migrated_pfn) + cc->last_migrated_pfn = low_pfn; + /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; break; } + + continue; +isolate_fail: + if (!skip_on_failure) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } + acct_isolated(zone, cc); + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + cc->last_migrated_pfn = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } } /* @@ -863,33 +901,30 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, ISOLATE_UNEVICTABLE); - /* - * In case of fatal failure, release everything that might - * have been isolated in the previous iteration, and signal - * the failure back to caller. - */ - if (!pfn) { - putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; + if (!pfn) break; - } if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; @@ -960,10 +995,10 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = pageblock_start_pfn(cc->free_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); - low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + low_pfn = pageblock_end_pfn(cc->migrate_pfn); /* * Isolate free pages until enough are available to migrate the @@ -974,6 +1009,7 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -998,8 +1034,12 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, false); + /* If isolation failed early, do not continue needlessly */ + if (!isolated && isolate_start_pfn < block_end_pfn && + cc->nr_migratepages > cc->nr_freepages) + break; /* * If we isolated enough freepages, or aborted due to async @@ -1103,8 +1143,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; - unsigned long isolate_start_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1115,16 +1156,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = pageblock_start_pfn(low_pfn); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(low_pfn); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1135,7 +1181,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1153,9 +1200,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ - isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); @@ -1163,15 +1209,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (cc->nr_migratepages && !cc->last_migrated_pfn) - cc->last_migrated_pfn = isolate_start_pfn; - - /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. @@ -1195,7 +1232,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1211,14 +1248,17 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1258,8 +1298,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1278,8 +1319,10 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) +static enum compact_result __compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1302,7 +1345,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1323,12 +1367,14 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) +enum compact_result compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx) { - unsigned long ret; + enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1336,9 +1382,42 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) { - int ret; + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. + */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) +{ + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1346,22 +1425,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); - switch (ret) { - case COMPACT_PARTIAL: - case COMPACT_SKIPPED: - /* Compaction is likely to fail */ + /* Compaction is likely to fail */ + if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) return ret; - case COMPACT_CONTINUE: - /* Fall through to compaction */ - ; - } + + /* huh, compaction_suitable is returning something unexpected */ + VM_BUG_ON(ret != COMPACT_CONTINUE); /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1371,15 +1446,19 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1427,6 +1506,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = COMPACT_CONTENDED; goto out; } + /* + * We failed to migrate at least one page in the current + * order-aligned block, so skip the rest of it. + */ + if (cc->direct_compaction && + (cc->mode == MIGRATE_ASYNC)) { + cc->migrate_pfn = block_end_pfn( + cc->migrate_pfn - 1, cc->order); + /* Draining pcplists is useless in this case */ + cc->last_migrated_pfn = 0; + + } } check_drain: @@ -1440,7 +1531,7 @@ check_drain: if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = - cc->migrate_pfn & ~((1UL << cc->order) - 1); + block_start_pfn(cc->migrate_pfn, cc->order); if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); @@ -1465,7 +1556,7 @@ out: cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ - free_pfn &= ~(pageblock_nr_pages-1); + free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() @@ -1483,11 +1574,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1497,6 +1588,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1524,15 +1616,15 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. */ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1546,15 +1638,17 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, - ac->classzone_idx); + ac_classzone_idx(ac)); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1564,7 +1658,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1586,7 +1680,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1621,7 +1716,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; @@ -1759,4 +1854,225 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + if (kthread_should_stop()) + return; + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ |