summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-04-14 16:49:17 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-14 16:49:17 -0700
commit1dcf58d6e6e6eb7ec10e9abc56887b040205b06f (patch)
treec03e7a25ef13eea62f1547914a76e5c68f3f4c28 /mm
parent80dcc31fbe55932ac9204daee5f2ebc0c49b6da3 (diff)
parente4b0db72be2487bae0e3251c22f82c104f7c1cfd (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patchbomb from Andrew Morton: - arch/sh updates - ocfs2 updates - kernel/watchdog feature - about half of mm/ * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (122 commits) Documentation: update arch list in the 'memtest' entry Kconfig: memtest: update number of test patterns up to 17 arm: add support for memtest arm64: add support for memtest memtest: use phys_addr_t for physical addresses mm: move memtest under mm mm, hugetlb: abort __get_user_pages if current has been oom killed mm, mempool: do not allow atomic resizing memcg: print cgroup information when system panics due to panic_on_oom mm: numa: remove migrate_ratelimited mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE mm: split ET_DYN ASLR from mmap ASLR s390: redefine randomize_et_dyn for ELF_ET_DYN_BASE mm: expose arch_mmap_rnd when available s390: standardize mmap_rnd() usage powerpc: standardize mmap_rnd() usage mips: extract logic for mmap_rnd() arm64: standardize mmap_rnd() usage x86: standardize mmap_rnd() usage arm: factor out mmap ASLR into mmap_rnd ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cleancache.c276
-rw-r--r--mm/cma.c49
-rw-r--r--mm/cma.h24
-rw-r--r--mm/cma_debug.c170
-rw-r--r--mm/compaction.c15
-rw-r--r--mm/filemap.c15
-rw-r--r--mm/gup.c124
-rw-r--r--mm/huge_memory.c39
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h4
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memcontrol.c194
-rw-r--r--mm/memory.c371
-rw-r--r--mm/memory_hotplug.c35
-rw-r--r--mm/mempolicy.c6
-rw-r--r--mm/mempool.c10
-rw-r--r--mm/memtest.c118
-rw-r--r--mm/migrate.c37
-rw-r--r--mm/mlock.c131
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c247
-rw-r--r--mm/slab.c22
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c28
-rw-r--r--mm/truncate.c37
-rw-r--r--mm/vmalloc.c8
30 files changed, 1199 insertions, 818 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
+config CMA_DEBUGFS
+ bool "CMA debugfs interface"
+ depends on CMA && DEBUG_FS
+ help
+ Turns on the DebugFS interface for CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
diff --git a/mm/Makefile b/mm/Makefile
index 15dbe9903c27..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
#include <linux/cleancache.h>
/*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
static u64 cleancache_puts;
static u64 cleancache_invalidates;
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- * a) mount / -> __cleancache_init_fs is called. We set the
- * [shared_|]fs_poolid_map and uuids for.
- *
- * b). user does I/Os -> we call the rest of __cleancache_* functions
- * which return immediately as cleancache_ops is false.
- *
- * c). modprobe zcache -> cleancache_register_ops. We init the backend
- * and set cleancache_ops to true, and for any fs_poolid_map
- * (which is set by __cleancache_init_fs) we initialize the poolid.
- *
- * d). user does I/Os -> now that cleancache_ops is true all the
- * __cleancache_* functions can call the backend. They all check
- * that fs_poolid_map is valid and if so invoke the backend.
- *
- * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
- * reset (which is the second check in the __cleancache_* ops
- * to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+ switch (sb->cleancache_poolid) {
+ case CLEANCACHE_NO_BACKEND:
+ __cleancache_init_fs(sb);
+ break;
+ case CLEANCACHE_NO_BACKEND_SHARED:
+ __cleancache_init_shared_fs(sb);
+ break;
+ }
+}
/*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
*/
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
{
- struct cleancache_ops *old = cleancache_ops;
- int i;
+ if (cmpxchg(&cleancache_ops, NULL, ops))
+ return -EBUSY;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_NO_BACKEND)
- fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
- if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
- shared_fs_poolid_map[i] = ops->init_shared_fs
- (uuids[i], PAGE_SIZE);
- }
/*
- * We MUST set cleancache_ops _after_ we have called the backends
- * init_fs or init_shared_fs functions. Otherwise the compiler might
- * re-order where cleancache_ops is set in this function.
+ * A cleancache backend can be built as a module and hence loaded after
+ * a cleancache enabled filesystem has called cleancache_init_fs. To
+ * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+ * for each active super block. To differentiate between local and
+ * shared filesystems, we temporarily initialize sb->cleancache_poolid
+ * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+ * respectively in case there is no backend registered at the time
+ * cleancache_init_fs or cleancache_init_shared_fs is called.
+ *
+ * Since filesystems can be mounted concurrently with cleancache
+ * backend registration, we have to be careful to guarantee that all
+ * cleancache enabled filesystems that has been mounted by the time
+ * cleancache_register_ops is called has got and all mounted later will
+ * get cleancache_poolid. This is assured by the following statements
+ * tied together:
+ *
+ * a) iterate_supers skips only those super blocks that has started
+ * ->kill_sb
+ *
+ * b) if iterate_supers encounters a super block that has not finished
+ * ->mount yet, it waits until it is finished
+ *
+ * c) cleancache_init_fs is called from ->mount and
+ * cleancache_invalidate_fs is called from ->kill_sb
+ *
+ * d) we call iterate_supers after cleancache_ops has been set
+ *
+ * From a) it follows that if iterate_supers skips a super block, then
+ * either the super block is already dead, in which case we do not need
+ * to bother initializing cleancache for it, or it was mounted after we
+ * initiated iterate_supers. In the latter case, it must have seen
+ * cleancache_ops set according to d) and initialized cleancache from
+ * ->mount by itself according to c). This proves that we call
+ * ->init_fs at least once for each active super block.
+ *
+ * From b) and c) it follows that if iterate_supers encounters a super
+ * block that has already started ->init_fs, it will wait until ->mount
+ * and hence ->init_fs has finished, then check cleancache_poolid, see
+ * that it has already been set and therefore do nothing. This proves
+ * that we call ->init_fs no more than once for each super block.
+ *
+ * Combined together, the last two paragraphs prove the function
+ * correctness.
+ *
+ * Note that various cleancache callbacks may proceed before this
+ * function is called or even concurrently with it, but since
+ * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+ * until the corresponding ->init_fs has been actually called and
+ * cleancache_ops has been set.
*/
- barrier();
- cleancache_ops = ops;
- mutex_unlock(&poolid_mutex);
- return old;
+ iterate_supers(cleancache_register_ops_sb, NULL);
+ return 0;
}
EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
- if (cleancache_ops)
- fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
- else
- fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
- uuids[i] = uuid;
- if (cleancache_ops)
- shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
- (uuid, PAGE_SIZE);
- else
- shared_fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
}
/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
- return shared_fs_poolid_map[fake_pool_id -
- FAKE_SHARED_FS_POOLID_OFFSET];
- else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
- return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
- return FS_NO_BACKEND;
-}
-
-/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
* successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
goto out;
- pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
- if (pool_id >= 0)
- ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ ret = cleancache_ops->get_page(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
void __cleancache_put_page(struct page *page)
{
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
cleancache_get_key(page->mapping->host, &key) >= 0) {
cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id >= 0) {
- pool_id = get_poolid_from_fake(fake_pool_id);
- if (pool_id < 0)
- return;
-
+ if (pool_id >= 0) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
cleancache_ops->invalidate_inode(pool_id, key);
}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
- int index;
- int fake_pool_id = sb->cleancache_poolid;
- int old_poolid = fake_pool_id;
+ int pool_id;
- mutex_lock(&poolid_mutex);
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
- old_poolid = shared_fs_poolid_map[index];
- shared_fs_poolid_map[index] = FS_UNKNOWN;
- uuids[index] = NULL;
- } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
- old_poolid = fs_poolid_map[index];
- fs_poolid_map[index] = FS_UNKNOWN;
- }
- sb->cleancache_poolid = -1;
- if (cleancache_ops)
- cleancache_ops->invalidate_fs(old_poolid);
- mutex_unlock(&poolid_mutex);
+ pool_id = sb->cleancache_poolid;
+ sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+ if (cleancache_ops && pool_id >= 0)
+ cleancache_ops->invalidate_fs(pool_id);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
- int i;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- fs_poolid_map[i] = FS_UNKNOWN;
- shared_fs_poolid_map[i] = FS_UNKNOWN;
- }
return 0;
}
module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
index 68ecb7a42983..47203faaf65e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,29 +35,24 @@
#include <linux/highmem.h>
#include <linux/io.h>
-struct cma {
- unsigned long base_pfn;
- unsigned long count;
- unsigned long *bitmap;
- unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
-};
-
-static struct cma cma_areas[MAX_CMA_AREAS];
-static unsigned cma_area_count;
+#include "cma.h"
+
+struct cma cma_areas[MAX_CMA_AREAS];
+unsigned cma_area_count;
static DEFINE_MUTEX(cma_mutex);
-phys_addr_t cma_get_base(struct cma *cma)
+phys_addr_t cma_get_base(const struct cma *cma)
{
return PFN_PHYS(cma->base_pfn);
}
-unsigned long cma_get_size(struct cma *cma)
+unsigned long cma_get_size(const struct cma *cma)
{
return cma->count << PAGE_SHIFT;
}
-static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -68,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
* Find a PFN aligned to the specified order and return an offset represented in
* order_per_bits.
*/
-static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -77,18 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
- cma->base_pfn) >> cma->order_per_bit;
}
-static unsigned long cma_bitmap_maxno(struct cma *cma)
-{
- return cma->count >> cma->order_per_bit;
-}
-
-static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
- unsigned long pages)
+static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
+ unsigned long pages)
{
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
+ unsigned int count)
{
unsigned long bitmap_no, bitmap_count;
@@ -134,6 +126,12 @@ static int __init cma_activate_area(struct cma *cma)
} while (--i);
mutex_init(&cma->lock);
+
+#ifdef CONFIG_CMA_DEBUGFS
+ INIT_HLIST_HEAD(&cma->mem_head);
+ spin_lock_init(&cma->mem_head_lock);
+#endif
+
return 0;
err:
@@ -167,7 +165,8 @@ core_initcall(cma_init_reserved_areas);
* This function creates custom contiguous area from already reserved memory.
*/
int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
- int order_per_bit, struct cma **res_cma)
+ unsigned int order_per_bit,
+ struct cma **res_cma)
{
struct cma *cma;
phys_addr_t alignment;
@@ -358,7 +357,7 @@ err:
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
{
unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -429,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, struct page *pages, int count)
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
{
unsigned long pfn;
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+ unsigned int order_per_bit; /* Order of pages represented by one bit */
+ struct mutex lock;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct hlist_head mem_head;
+ spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+ return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..0b377536ccde
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,170 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+ struct hlist_node node;
+ struct page *p;
+ unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+ unsigned long *p = data;
+
+ *val = *p;
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+ spin_lock(&cma->mem_head_lock);
+ hlist_add_head(&mem->node, &cma->mem_head);
+ spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+ struct cma_mem *mem = NULL;
+
+ spin_lock(&cma->mem_head_lock);
+ if (!hlist_empty(&cma->mem_head)) {
+ mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+ hlist_del_init(&mem->node);
+ }
+ spin_unlock(&cma->mem_head_lock);
+
+ return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem = NULL;
+
+ while (count) {
+ mem = cma_get_entry_from_list(cma);
+ if (mem == NULL)
+ return 0;
+
+ if (mem->n <= count) {
+ cma_release(cma, mem->p, mem->n);
+ count -= mem->n;
+ kfree(mem);
+ } else if (cma->order_per_bit == 0) {
+ cma_release(cma, mem->p, count);
+ mem->p += count;
+ mem->n -= count;
+ count = 0;
+ cma_add_to_cma_mem_list(cma, mem);
+ } else {
+ pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+ cma_add_to_cma_mem_list(cma, mem);
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_free_mem(cma, pages);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem;
+ struct page *p;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ p = cma_alloc(cma, count, 0);
+ if (!p) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->p = p;
+ mem->n = count;
+
+ cma_add_to_cma_mem_list(cma, mem);
+
+ return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_alloc_mem(cma, pages);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+ struct dentry *tmp;
+ char name[16];
+ int u32s;
+
+ sprintf(name, "cma-%d", idx);
+
+ tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+ debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+ &cma_alloc_fops);
+
+ debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+ &cma_free_fops);
+
+ debugfs_create_file("base_pfn", S_IRUGO, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", S_IRUGO, tmp,
+ &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+
+ u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+ int i;
+
+ cma_debugfs_root = debugfs_create_dir("cma", NULL);
+ if (!cma_debugfs_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_debugfs_add_one(&cma_areas[i], i);
+
+ return 0;
+}
+late_initcall(cma_debugfs_init);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..a18201a8124e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1174,13 +1174,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
+ bool can_steal;
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && area->nr_free)
+#ifdef CONFIG_CMA
+ /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+ if (migratetype == MIGRATE_MOVABLE &&
+ !list_empty(&area->free_list[MIGRATE_CMA]))
+ return COMPACT_PARTIAL;
+#endif
+ /*
+ * Job done if allocation would steal freepages from
+ * other migratetype buddy lists.
+ */
+ if (find_suitable_fallback(area, order, migratetype,
+ true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 876f4e6f3ed6..12548d03c11d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -202,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
BUG_ON(page_mapped(page));
/*
- * Some filesystems seem to re-dirty the page even after
- * the VM has canceled the dirty bit (eg ext3 journaling).
+ * At this point page must be either written or cleaned by truncate.
+ * Dirty page here signals a bug and loss of unwritten data.
*
- * Fix it up by doing a final dirty accounting check after
- * having removed the page entirely.
+ * This fixes dirty accounting after removing the page entirely but
+ * leaves PageDirty set: it has no effect for truncated page and
+ * anyway will be cleared before returning page into buddy allocator.
*/
- if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
- }
+ if (WARN_ON_ONCE(PageDirty(page)))
+ account_page_cleaned(page, mapping);
}
/**
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..ca7b607ab671 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
- /* For mlock, just skip the stack guard page. */
- if ((*flags & FOLL_MLOCK) &&
+ /* For mm_populate(), just skip the stack guard page. */
+ if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
stack_guard_page_end(vma, address + PAGE_SIZE)))
return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
EXPORT_SYMBOL(get_user_pages);
/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking:
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must held for read only and may be
+ * released. If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..3afb5cbe1312 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -2109,7 +2109,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval))
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
release_pte_page(pte_page(pteval));
}
}
@@ -2120,13 +2120,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int none = 0;
+ int none_or_zero = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2207,9 +2207,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
pte_t pteval = *_pte;
struct page *src_page;
- if (pte_none(pteval)) {
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
@@ -2316,8 +2328,14 @@ static struct page
struct vm_area_struct *vma, unsigned long address,
int node)
{
+ gfp_t flags;
+
VM_BUG_ON_PAGE(*hpage, *hpage);
+ /* Only allocate from the target node */
+ flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+ __GFP_THISNODE;
+
/*
* Before allocating the hugepage, release the mmap_sem read lock.
* The allocation can take potentially a long time if it involves
@@ -2326,8 +2344,7 @@ static struct page
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2543,7 +2560,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none = 0;
+ int ret = 0, none_or_zero = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
@@ -2561,8 +2578,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c41b2a0ee273..8874c8ad55aa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3278,6 +3278,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
/*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current))) {
+ remainder = 0;
+ break;
+ }
+
+ /*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
@@ -3735,8 +3744,7 @@ retry:
if (!pmd_huge(*pmd))
goto out;
if (pmd_present(*pmd)) {
- page = pte_page(*(pte_t *)pmd) +
- ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..edaab69a9c35 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
#endif
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
#ifdef CONFIG_MMU
-extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..3f37a0bca5d5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..c3f09b2dda5f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -1436,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
struct mem_cgroup *iter;
unsigned int i;
- if (!p)
- return;
-
mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
+ if (p) {
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ } else {
+ pr_info("Memory limit reached of cgroup ");
+ }
+
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont("\n");
@@ -1531,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
return;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -2779,92 +2787,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
- *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- unsigned long flags;
- int ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
-
- /*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
- */
- if (!trylock_page(page))
- goto out;
-
- ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
-
- spin_lock_irqsave(&from->move_lock, flags);
-
- if (!PageAnon(page) && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- }
-
- /*
- * It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
- */
-
- /* caller should have done css_get */
- page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
-
- ret = 0;
-
- local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
- local_irq_enable();
-out_unlock:
- unlock_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -4816,6 +4738,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ spin_lock_irqsave(&from->move_lock, flags);
+
+ if (!PageAnon(page) && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ /*
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
+
+ /* caller should have done css_get */
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..ac20b2a6a0c3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ struct page *page, int page_mkwrite,
+ int dirty_shared)
__releases(ptl)
{
- struct page *old_page, *new_page = NULL;
pte_t entry;
- int ret = 0;
- int page_mkwrite = 0;
- bool dirty_shared = false;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
- struct mem_cgroup *memcg;
-
- old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page) {
- /*
- * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
- *
- * We should not cow pages in a shared writeable mapping.
- * Just mark the pages writable as we can't do any dirty
- * accounting on raw pfn maps.
- */
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- goto reuse;
- goto gotten;
- }
-
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
- if (!trylock_page(old_page)) {
- page_cache_get(old_page);
- pte_unmap_unlock(page_table, ptl);
- lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_cache_release(old_page);
- }
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
- unlock_page(old_page);
- goto reuse;
- }
- unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
-
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
- }
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_mkwrite = 1;
- }
-
- dirty_shared = true;
-
-reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
- ret |= VM_FAULT_WRITE;
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
- if (!page_mkwrite)
- lock_page(old_page);
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- dirtied = set_page_dirty(old_page);
- VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
- mapping = old_page->mapping;
- unlock_page(old_page);
- page_cache_release(old_page);
+ if (!page_mkwrite)
+ lock_page(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- return ret;
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
- /*
- * Ok, we need to copy. Oh, well..
- */
- page_cache_get(old_page);
-gotten:
- pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte, struct page *old_page)
+{
+ struct page *new_page = NULL;
+ spinlock_t *ptl = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2163,8 +2087,6 @@ gotten:
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2177,8 +2099,9 @@ gotten:
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- } else
+ } else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
/* Free the old page.. */
new_page = old_page;
- ret |= VM_FAULT_WRITE;
- } else
+ page_copied = 1;
+ } else {
mem_cgroup_cancel_charge(new_page, memcg);
+ }
if (new_page)
page_cache_release(new_page);
-unlock:
+
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
- return ret;
+ return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
page_cache_release(new_page);
oom:
@@ -2258,6 +2181,144 @@ oom:
return VM_FAULT_OOM;
}
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table,
+ pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+ struct page *old_page)
+ __releases(ptl)
+{
+ int page_mkwrite = 0;
+
+ page_cache_get(old_page);
+
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ pte_unmap_unlock(page_table, ptl);
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_mkwrite = 1;
+ }
+
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
+{
+ struct page *old_page;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+ }
+ unlock_page(old_page);
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(mm, vma, address, page_table, pmd,
+ ptl, orig_pte, old_page);
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+}
+
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 65842d688b7c..e2e8014fb755 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
}
-static void mem_hotplug_begin(void)
+void mem_hotplug_begin(void)
{
mem_hotplug.active_writer = current;
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
}
}
-static void mem_hotplug_done(void)
+void mem_hotplug_done(void)
{
mem_hotplug.active_writer = NULL;
mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, section_nr_to_pfn(i));
/*
* EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
}
+/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
- ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL ||
online_type == MMOP_ONLINE_MOVABLE) &&
!can_online_high_movable(zone))
- goto out;
+ return -EINVAL;
if (online_type == MMOP_ONLINE_KERNEL &&
zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
if (online_type == MMOP_ONLINE_MOVABLE &&
zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
/* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
-out:
- mem_hotplug_done();
- return ret;
+ return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- mem_hotplug_begin();
-
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- ret = -EINVAL;
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- goto out;
+ return -EINVAL;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
if (ret)
- goto out;
+ return ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1795,7 +1789,6 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- mem_hotplug_done();
return 0;
failed_removal:
@@ -1805,12 +1798,10 @@ failed_removal:
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-
-out:
- mem_hotplug_done();
return ret;
}
+/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE, 0);
}
/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(node, gfp, order);
+ page = alloc_pages_exact_node(node,
+ gfp | __GFP_THISNODE, order);
goto out;
}
}
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..949970db2874 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node);
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
- * @gfp_mask: the usual allocation bitmask.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ might_sleep();
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
- new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
if (!new_elements)
return -ENOMEM;
@@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
- element = pool->alloc(gfp_mask, pool->pool_data);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..1997d934b13b
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,118 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/memblock.h>
+
+static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
+ 0,
+ 0xffffffffffffffffULL,
+ 0x5555555555555555ULL,
+ 0xaaaaaaaaaaaaaaaaULL,
+ 0x1111111111111111ULL,
+ 0x2222222222222222ULL,
+ 0x4444444444444444ULL,
+ 0x8888888888888888ULL,
+ 0x3333333333333333ULL,
+ 0x6666666666666666ULL,
+ 0x9999999999999999ULL,
+ 0xccccccccccccccccULL,
+ 0x7777777777777777ULL,
+ 0xbbbbbbbbbbbbbbbbULL,
+ 0xddddddddddddddddULL,
+ 0xeeeeeeeeeeeeeeeeULL,
+ 0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
+{
+ printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
+ (unsigned long long) pattern,
+ (unsigned long long) start_bad,
+ (unsigned long long) end_bad);
+ memblock_reserve(start_bad, end_bad - start_bad);
+}
+
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
+{
+ u64 *p, *start, *end;
+ phys_addr_t start_bad, last_bad;
+ phys_addr_t start_phys_aligned;
+ const size_t incr = sizeof(pattern);
+
+ start_phys_aligned = ALIGN(start_phys, incr);
+ start = __va(start_phys_aligned);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
+ start_bad = 0;
+ last_bad = 0;
+
+ for (p = start; p < end; p++)
+ *p = pattern;
+
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
+ continue;
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ continue;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+ start_bad = last_bad = start_phys_aligned;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
+{
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+ if (this_start < this_end) {
+ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+ (unsigned long long)this_start,
+ (unsigned long long)this_end,
+ (unsigned long long)cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
+ }
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
+{
+ unsigned int i;
+ unsigned int idx = 0;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
+ idx = i % ARRAY_SIZE(patterns);
+ do_one_pass(patterns[idx], start, end);
+ }
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..a65ff72ab739 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -901,12 +901,23 @@ out:
}
/*
+ * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
+ * around it.
+ */
+#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#define ICE_noinline noinline
+#else
+#define ICE_noinline
+#endif
+
+/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct page *page, int force,
- enum migrate_mode mode)
+static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+ free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -1554,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
- * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
- * as it is faults that reset the window, pte updates will happen unconditionally
- * if there has not been a fault since @pteupdate_interval_millisecs after the
- * throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-/* Returns true if NUMA migration is currently rate limited */
-bool migrate_ratelimited(int node)
-{
- pg_data_t *pgdat = NODE_DATA(node);
-
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
- msecs_to_jiffies(pteupdate_interval_millisecs)))
- return false;
-
- if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
- return false;
-
- return true;
-}
-
/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
unsigned long nr_pages)
diff --git a/mm/mlock.c b/mm/mlock.c
index 8a54cd214925..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,62 +205,6 @@ out:
return nr_pages - 1;
}
-/**
- * __mlock_vma_pages_range() - mlock a range of pages in the vma.
- * @vma: target vma
- * @start: start address
- * @end: end address
- * @nonblocking:
- *
- * This takes care of making the pages present too.
- *
- * return 0 on success, negative error code on error.
- *
- * vma->vm_mm->mmap_sem must be held.
- *
- * If @nonblocking is NULL, it may be held for read or write and will
- * be unperturbed.
- *
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released. If it's released, *@nonblocking will be set to 0.
- */
-long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long nr_pages = (end - start) / PAGE_SIZE;
- int gup_flags;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
- gup_flags = FOLL_TOUCH | FOLL_MLOCK;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
- gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
- gup_flags |= FOLL_FORCE;
-
- /*
- * We made sure addr is within a VMA, so the following will
- * not result in a stack expansion that recurses back here.
- */
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
-}
-
/*
* convert get_user_pages() return value to posix mlock() error
*/
@@ -596,7 +540,7 @@ success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
- * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
+ * set VM_LOCKED, populate_vma_page_range will bring it back.
*/
if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-/*
- * __mm_populate - populate and/or mlock pages within a range of address space.
- *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
- */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
-{
- struct mm_struct *mm = current->mm;
- unsigned long end, nstart, nend;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
- long ret = 0;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(len != PAGE_ALIGN(len));
- end = start + len;
-
- for (nstart = start; nstart < end; nstart = nend) {
- /*
- * We want to fault in pages for [nstart; end) address range.
- * Find first corresponding VMA.
- */
- if (!locked) {
- locked = 1;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * Set [nstart; nend) to intersection of desired address
- * range with the first VMA. Also, skip undesirable VMA types.
- */
- nend = min(end, vma->vm_end);
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- /*
- * Now fault in a range of pages. __mlock_vma_pages_range()
- * double checks the vma flags, so that it won't mlock pages
- * if the vma was already munlocked.
- */
- ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
- if (ret < 0) {
- if (ignore_errors) {
- ret = 0;
- continue; /* continue at next VMA */
- }
- ret = __mlock_posix_error_return(ret);
- break;
- }
- nend = nstart + ret * PAGE_SIZE;
- ret = 0;
- }
- if (locked)
- up_read(&mm->mmap_sem);
- return ret; /* 0 or negative error code */
-}
-
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
- if (!error)
- error = __mm_populate(start, len, 0);
- return error;
+ if (error)
+ return error;
+
+ error = __mm_populate(start, len, 0);
+ if (error)
+ return __mlock_posix_error_return(error);
+ return 0;
}
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9ec50a368634..06a6076c92e5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
@@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..52628c819bf7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask)
+ int order, const nodemask_t *nodemask,
+ struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ dump_header(NULL, gfp_mask, order, memcg, nodemask);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 644bcb665773..0372411f38fc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for deaccounting dirty page without writeback.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void account_page_cleaned(struct page *page, struct address_space *mapping)
+{
+ if (mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ }
+}
+EXPORT_SYMBOL(account_page_cleaned);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..1b849500640c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
-#else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; }
+#endif
+
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
*/
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually steal whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock and check whether half of pages are moved or not. If half of
+ * pages are moved, we can change migratetype of pageblock and permanently
+ * use it's pages as requested migratetype in the future.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ int start_type)
{
int current_order = page_order(page);
+ int pages;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
return;
}
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- start_type == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled) {
- int pages;
+ pages = move_freepages_block(zone, page, start_type);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_RESERVE)
+ break;
+
+ if (list_empty(&area->free_list[fallback_mt]))
+ continue;
- pages = move_freepages_block(zone, page, start_type);
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
+ if (!only_stealable)
+ return fallback_mt;
+
+ if (*can_steal)
+ return fallback_mt;
}
+
+ return -1;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
+ int fallback_mt;
+ bool can_steal;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
- int i;
- for (i = 0;; i++) {
- int migratetype = fallbacks[start_migratetype][i];
- int buddy_type = start_migratetype;
-
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
-
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- if (!is_migrate_cma(migratetype)) {
- try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
- } else {
- /*
- * When borrowing from MIGRATE_CMA, we need to
- * release the excess buddy pages to CMA
- * itself, and we do not try to steal extra
- * free pages.
- */
- buddy_type = migratetype;
- }
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
+ page = list_entry(area->free_list[fallback_mt].next,
+ struct page, lru);
+ if (can_steal)
+ steal_suitable_fallback(zone, page, start_migratetype);
- expand(zone, page, order, current_order, area,
- buddy_type);
+ /* Remove the page from the freelists */
+ area->nr_free--;
+ list_del(&page->lru);
+ rmv_page_order(page);
- /*
- * The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
- */
- set_freepage_migratetype(page, buddy_type);
+ expand(zone, page, order, current_order, area,
+ start_migratetype);
+ /*
+ * The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
+ */
+ set_freepage_migratetype(page, start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
- return page;
- }
+ return page;
}
return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (migratetype == MIGRATE_MOVABLE)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 1;
goto out;
}
- /*
- * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
- * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
- * The caller should handle page allocation failure by itself if
- * it specifies __GFP_THISNODE.
- * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
- */
+ /* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
@@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
- * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- * __GFP_NOWARN set) should not cause reclaim since the subsystem
- * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- * using a larger set of nodes after it has established that the
- * allowed per node queues are empty and that nodes are
- * over allocated.
+ * If this allocation cannot block and it is for a specific node, then
+ * fail early. There's no need to wakeup kswapd or retry for a
+ * speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
goto nopage;
retry:
@@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
+ * of __GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
@@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type)
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
*/
void show_free_areas(unsigned int filter)
{
+ unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
- show_node(zone);
- printk("%s per-cpu:\n", zone->name);
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pageset;
-
- pageset = per_cpu_ptr(zone->pageset, cpu);
-
- printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp.high,
- pageset->pcp.batch, pageset->pcp.count);
- }
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free_cma:%lu\n",
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_PAGES),
+ free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
@@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter)
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
" free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
@@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas controls asynch page reclaim, and so should
+ * deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
return NULL;
}
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return flags;
+}
+
#else /* CONFIG_NUMA */
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return __cache_free_alien(cachep, objp, node, page_node);
}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not invoke reclaim
+ * or warn about failures.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+}
#endif
/*
@@ -2825,7 +2839,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
force_grow:
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (obj)
break;
}
@@ -3047,7 +3061,7 @@ retry:
nid = page_to_nid(page);
if (cache_grow(cache, flags, nid, page)) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (!obj)
/*
* Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
if (x)
goto retry;
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
return 0;
}
-void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
}
-EXPORT_SYMBOL(slob_alloc_node);
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
diff --git a/mm/slub.c b/mm/slub.c
index 82c473780c91..0fdd6c1e1f82 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
page->freelist = freelist_new;
set_page_slub_counters(page, counters_new);
slab_unlock(page);
- return 1;
+ return true;
}
slab_unlock(page);
}
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
- return 1;
+ return true;
}
slab_unlock(page);
local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
*/
goto check_slabs;
- if (tolower(*str) == 'o') {
- /*
- * Avoid enabling debugging on caches if its minimum order
- * would increase as a result.
- */
- disable_higher_order_debug = 1;
- goto out;
- }
-
slub_debug = 0;
if (*str == '-')
/*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
case 'a':
slub_debug |= SLAB_FAILSLAB;
break;
+ case 'o':
+ /*
+ * Avoid enabling debugging on caches if its minimum
+ * order would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ break;
default:
pr_err("slub_debug option '%c' unknown. skipped\n",
*str);
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..7a9d8a3cb143 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
}
/*
- * This cancels just the dirty bit on the kernel page itself, it
- * does NOT actually remove dirty bits on any mmap's that may be
- * around. It also leaves the page tagged dirty, so any sync
- * activity will still find it on the dirty lists, and in particular,
- * clear_page_dirty_for_io() will still look at the dirty bits in
- * the VM.
- *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
- */
-void cancel_dirty_page(struct page *page, unsigned int account_size)
-{
- if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- if (account_size)
- task_io_account_cancelled_write(account_size);
- }
- }
-}
-EXPORT_SYMBOL(cancel_dirty_page);
-
-/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ * Hence dirty accounting check is placed after invalidation.
+ */
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49abccf29a29..a5bbdd3b5d67 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_clear_huge(pmd))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_clear_huge(pud))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
@@ -1314,7 +1319,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ align = 1ul << clamp_t(int, fls_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
size = PAGE_ALIGN(size);
if (unlikely(!size))