summaryrefslogtreecommitdiff
path: root/mm/compaction.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--mm/compaction.c618
1 files changed, 467 insertions, 151 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..79bfe0e06907 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,8 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -39,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
+#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -71,49 +79,6 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
@@ -200,7 +165,8 @@ static void reset_cached_positions(struct zone *zone)
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
/*
@@ -475,25 +441,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
+ if (!isolated)
+ break;
+
total_isolated += isolated;
+ cc->nr_freepages += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
page++;
}
-
- /* If a page was split, advance to the end of it */
- if (isolated) {
- cc->nr_freepages += isolated;
- if (!strict &&
- cc->nr_migratepages <= cc->nr_freepages) {
- blockpfn += isolated;
- break;
- }
-
- blockpfn += isolated - 1;
- cursor += isolated - 1;
- continue;
+ if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
+ blockpfn += isolated;
+ break;
}
+ /* Advance to the end of split page */
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ continue;
isolate_fail:
if (strict)
@@ -503,6 +467,9 @@ isolate_fail:
}
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
/*
* There is a tiny chance that we have read bogus compound_order(),
* so be careful to not go outside of the pageblock.
@@ -524,9 +491,6 @@ isolate_fail:
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- if (locked)
- spin_unlock_irqrestore(&cc->zone->lock, flags);
-
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
@@ -554,13 +518,17 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
@@ -573,11 +541,13 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -666,12 +636,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
+ bool skip_on_failure = false;
+ unsigned long next_skip_pfn = 0;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -692,10 +663,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (compact_should_abort(cc))
return 0;
+ if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+ skip_on_failure = true;
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
bool is_lru;
+ if (skip_on_failure && low_pfn >= next_skip_pfn) {
+ /*
+ * We have isolated all migration candidates in the
+ * previous order-aligned block, and did not skip it due
+ * to failure. We should migrate the pages now and
+ * hopefully succeed compaction.
+ */
+ if (nr_isolated)
+ break;
+
+ /*
+ * We failed to isolate in the previous order-aligned
+ * block. Set the new boundary to the end of the
+ * current block. Note we can't simply increase
+ * next_skip_pfn by 1 << order, as low_pfn might have
+ * been incremented by a higher number due to skipping
+ * a compound or a high-order buddy page in the
+ * previous loop iteration.
+ */
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -707,7 +705,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
break;
if (!pfn_valid_within(low_pfn))
- continue;
+ goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -762,11 +760,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (likely(comp_order < MAX_ORDER))
low_pfn += (1UL << comp_order) - 1;
- continue;
+ goto isolate_fail;
}
if (!is_lru)
- continue;
+ goto isolate_fail;
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -775,7 +773,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
- continue;
+ goto isolate_fail;
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
@@ -786,7 +784,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
- continue;
+ goto isolate_fail;
/*
* Page become compound since the non-locked check,
@@ -795,7 +793,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
- continue;
+ goto isolate_fail;
}
}
@@ -803,7 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
- continue;
+ goto isolate_fail;
VM_BUG_ON_PAGE(PageCompound(page), page);
@@ -811,15 +809,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
- list_add(&page->lru, migratelist);
+ list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;
+ /*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator.
+ * - this is the lowest page that was isolated and likely be
+ * then freed by migration.
+ */
+ if (!cc->last_migrated_pfn)
+ cc->last_migrated_pfn = low_pfn;
+
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
+
+ continue;
+isolate_fail:
+ if (!skip_on_failure)
+ continue;
+
+ /*
+ * We have isolated some pages, but then failed. Release them
+ * instead of migrating, as we cannot form the cc->order buddy
+ * page anyway.
+ */
+ if (nr_isolated) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ acct_isolated(zone, cc);
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ cc->last_migrated_pfn = 0;
+ nr_isolated = 0;
+ }
+
+ if (low_pfn < next_skip_pfn) {
+ low_pfn = next_skip_pfn - 1;
+ /*
+ * The check near the loop beginning would have updated
+ * next_skip_pfn too, but this is a bit simpler.
+ */
+ next_skip_pfn += 1UL << cc->order;
+ }
}
/*
@@ -863,33 +901,30 @@ unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_start_pfn = pageblock_start_pfn(pfn);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+ block_end_pfn = pageblock_end_pfn(pfn);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
ISOLATE_UNEVICTABLE);
- /*
- * In case of fatal failure, release everything that might
- * have been isolated in the previous iteration, and signal
- * the failure back to caller.
- */
- if (!pfn) {
- putback_movable_pages(&cc->migratepages);
- cc->nr_migratepages = 0;
+ if (!pfn)
break;
- }
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
break;
@@ -960,10 +995,10 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
- low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ low_pfn = pageblock_end_pfn(cc->migrate_pfn);
/*
* Isolate free pages until enough are available to migrate the
@@ -974,6 +1009,7 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
+ unsigned long isolated;
/*
* This can iterate a massively long zone without finding any
@@ -998,8 +1034,12 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, freelist, false);
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, false);
+ /* If isolation failed early, do not continue needlessly */
+ if (!isolated && isolate_start_pfn < block_end_pfn &&
+ cc->nr_migratepages > cc->nr_freepages)
+ break;
/*
* If we isolated enough freepages, or aborted due to async
@@ -1103,8 +1143,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
- unsigned long isolate_start_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1115,16 +1156,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = pageblock_start_pfn(low_pfn);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(low_pfn);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
@@ -1135,7 +1181,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
@@ -1153,9 +1200,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
/* Perform the isolation */
- isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
@@ -1163,15 +1209,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (cc->nr_migratepages && !cc->last_migrated_pfn)
- cc->last_migrated_pfn = isolate_start_pfn;
-
- /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1195,7 +1232,7 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static int __compact_finished(struct zone *zone, struct compact_control *cc,
+static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
@@ -1211,14 +1248,17 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
- return COMPACT_COMPLETE;
+ if (cc->whole_zone)
+ return COMPACT_COMPLETE;
+ else
+ return COMPACT_PARTIAL_SKIPPED;
}
if (is_via_compact_memory(cc->order))
@@ -1258,8 +1298,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_NO_SUITABLE_PAGE;
}
-static int compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result compact_finished(struct zone *zone,
+ struct compact_control *cc,
+ const int migratetype)
{
int ret;
@@ -1278,8 +1319,10 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-static unsigned long __compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int classzone_idx,
+ unsigned long wmark_target)
{
int fragindex;
unsigned long watermark;
@@ -1302,7 +1345,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
* allocated and for a short time, the footprint is higher
*/
watermark += (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
+ if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ alloc_flags, wmark_target))
return COMPACT_SKIPPED;
/*
@@ -1323,12 +1367,14 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
return COMPACT_CONTINUE;
}
-unsigned long compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+enum compact_result compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+ ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ zone_page_state(zone, NR_FREE_PAGES));
trace_mm_compaction_suitable(zone, order, ret);
if (ret == COMPACT_NOT_SUITABLE_ZONE)
ret = COMPACT_SKIPPED;
@@ -1336,9 +1382,42 @@ unsigned long compaction_suitable(struct zone *zone, int order,
return ret;
}
-static int compact_zone(struct zone *zone, struct compact_control *cc)
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+ int alloc_flags)
{
- int ret;
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure at least one zone would pass __compaction_suitable if we continue
+ * retrying the reclaim.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ enum compact_result compact_result;
+
+ /*
+ * Do not consider all the reclaimable memory because we do not
+ * want to trash just for a single high order allocation which
+ * is even not guaranteed to appear even if __compaction_suitable
+ * is happy about the watermark check.
+ */
+ available = zone_reclaimable_pages(zone) / order;
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ compact_result = __compaction_suitable(zone, order, alloc_flags,
+ ac_classzone_idx(ac), available);
+ if (compact_result != COMPACT_SKIPPED &&
+ compact_result != COMPACT_NOT_SUITABLE_ZONE)
+ return true;
+ }
+
+ return false;
+}
+
+static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
@@ -1346,22 +1425,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
- switch (ret) {
- case COMPACT_PARTIAL:
- case COMPACT_SKIPPED:
- /* Compaction is likely to fail */
+ /* Compaction is likely to fail */
+ if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
return ret;
- case COMPACT_CONTINUE:
- /* Fall through to compaction */
- ;
- }
+
+ /* huh, compaction_suitable is returning something unexpected */
+ VM_BUG_ON(ret != COMPACT_CONTINUE);
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1371,15 +1446,19 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+
+ if (cc->migrate_pfn == start_pfn)
+ cc->whole_zone = true;
+
cc->last_migrated_pfn = 0;
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
@@ -1427,6 +1506,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = COMPACT_CONTENDED;
goto out;
}
+ /*
+ * We failed to migrate at least one page in the current
+ * order-aligned block, so skip the rest of it.
+ */
+ if (cc->direct_compaction &&
+ (cc->mode == MIGRATE_ASYNC)) {
+ cc->migrate_pfn = block_end_pfn(
+ cc->migrate_pfn - 1, cc->order);
+ /* Draining pcplists is useless in this case */
+ cc->last_migrated_pfn = 0;
+
+ }
}
check_drain:
@@ -1440,7 +1531,7 @@ check_drain:
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
- cc->migrate_pfn & ~((1UL << cc->order) - 1);
+ block_start_pfn(cc->migrate_pfn, cc->order);
if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
@@ -1465,7 +1556,7 @@ out:
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
- free_pfn &= ~(pageblock_nr_pages-1);
+ free_pfn = pageblock_start_pfn(free_pfn);
/*
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
@@ -1483,11 +1574,11 @@ out:
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone, int order,
+static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
@@ -1497,6 +1588,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1524,15 +1616,15 @@ int sysctl_extfrag_threshold = 500;
*
* This is the main entry point for direct page compaction.
*/
-unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
- int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
- int rc = COMPACT_DEFERRED;
+ enum compact_result rc = COMPACT_SKIPPED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
@@ -1546,15 +1638,17 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
- int status;
+ enum compact_result status;
int zone_contended;
- if (compaction_deferred(zone, order))
+ if (compaction_deferred(zone, order)) {
+ rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
+ }
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
- ac->classzone_idx);
+ ac_classzone_idx(ac));
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1564,7 +1658,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac->classzone_idx, alloc_flags)) {
+ ac_classzone_idx(ac), alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
@@ -1586,7 +1680,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
goto break_loop;
}
- if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
+ if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
+ status == COMPACT_PARTIAL_SKIPPED)) {
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
@@ -1621,7 +1716,7 @@ break_loop:
* If at least one zone wasn't deferred or skipped, we report if all
* zones that were tried were lock contended.
*/
- if (rc > COMPACT_SKIPPED && all_zones_contended)
+ if (rc > COMPACT_INACTIVE && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc;
@@ -1759,4 +1854,225 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ if (kthread_should_stop())
+ return;
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */