 Documentation/admin-guide/cgroup-v2.rst    |  4
 Documentation/admin-guide/mm/transhuge.rst |  3
 Documentation/core-api/pin_user_pages.rst  |  2
 MAINTAINERS                                |  4
 arch/arm64/kernel/probes/kprobes.c         | 12
 arch/openrisc/kernel/dma.c                 |  5
 arch/x86/hyperv/hv_init.c                  |  4
 arch/x86/include/asm/pgtable_types.h       |  2
 drivers/media/platform/omap3isp/isp.c      |  2
 drivers/media/platform/omap3isp/ispvideo.c |  1
 fs/ocfs2/dlmglue.c                         | 17
 fs/ocfs2/ocfs2.h                           |  1
 fs/ocfs2/ocfs2_fs.h                        |  4
 fs/ocfs2/suballoc.c                        |  9
 include/asm-generic/cacheflush.h           |  5
 include/linux/mmzone.h                     |  4
 include/linux/swap.h                       |  1
 include/linux/vmalloc.h                    |  1
 kernel/kexec_file.c                        | 34
 kernel/module.c                            |  4
 lib/test_hmm.c                             |  3
 mm/compaction.c                            | 17
 mm/debug_vm_pgtable.c                      |  4
 mm/memcontrol.c                            | 18
 mm/memory.c                                | 33
 mm/memory_hotplug.c                        | 13
 mm/nommu.c                                 | 17
 mm/slab.h                                  |  4
 mm/slab_common.c                           |  2
 mm/slub.c                                  | 19
 mm/swap.c                                  |  3
 mm/swap_state.c                            |  4
 mm/vmalloc.c                               | 21
 mm/vmscan.c                                |  3
 mm/workingset.c                            | 46
 35 files changed, 165 insertions(+), 161 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index ce3e05e41724..d09471aa7443 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1356,8 +1356,8 @@ PAGE_SIZE multiple when read back.
thp_fault_alloc
Number of transparent hugepages which were allocated to satisfy
- a page fault, including COW faults. This counter is not present
- when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+ a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
+ is not set.
thp_collapse_alloc
Number of transparent hugepages which were allocated to allow
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 6a233e42be08..b2acd0d395ca 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -305,8 +305,7 @@ monitor how successfully the system is providing huge pages for use.
thp_fault_alloc
is incremented every time a huge page is successfully
- allocated to handle a page fault. This applies to both the
- first time a page is faulted and for COW faults.
+ allocated to handle a page fault.
thp_collapse_alloc
is incremented by khugepaged when it has found
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index 6068266dd303..7ca8c7bac650 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -33,7 +33,7 @@ all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
that's a natural dividing line, and a good point to make separate wrapper calls.
In other words, use pin_user_pages*() for DMA-pinned pages, and
-get_user_pages*() for other cases. There are four cases described later on in
+get_user_pages*() for other cases. There are five cases described later on in
this document, to further clarify that concept.
FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
diff --git a/MAINTAINERS b/MAINTAINERS
index 63c4cc4a04d6..28bd4715ebd5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16058,8 +16058,10 @@ SPARSE CHECKER
M: "Luc Van Oostenryck" <luc.vanoostenryck@gmail.com>
L: linux-sparse@vger.kernel.org
S: Maintained
-W: https://sparse.wiki.kernel.org/
+W: https://sparse.docs.kernel.org/
T: git git://git.kernel.org/pub/scm/devel/sparse/sparse.git
+Q: https://patchwork.kernel.org/project/linux-sparse/list/
+B: https://bugzilla.kernel.org/enter_bug.cgi?component=Sparse&product=Tools
F: include/linux/compiler.h
SPEAR CLOCK FRAMEWORK SUPPORT
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d1c95dcf1d78..cbe49cd117cf 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -120,15 +120,9 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
void *alloc_insn_page(void)
{
- void *page;
-
- page = vmalloc_exec(PAGE_SIZE);
- if (page) {
- set_memory_ro((unsigned long)page, 1);
- set_vm_flush_reset_perms(page);
- }
-
- return page;
+ return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
}
/* arm kprobe: install breakpoint in text */
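For callers that still need executable mappings after the vmalloc_exec() removal, the replacement is the open-coded __vmalloc_node_range() call seen above. A minimal sketch of the pattern, matching how it is used elsewhere in this series:

#include <linux/vmalloc.h>
#include <linux/mm.h>

/*
 * Sketch of the vmalloc_exec() replacement pattern. PAGE_KERNEL_ROX
 * maps the page read-only + executable from the start, avoiding a
 * transient writable+executable window, and VM_FLUSH_RESET_PERMS makes
 * vfree() restore the default permissions and flush the direct map.
 */
static void *alloc_rox_page(void)
{
	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
			NUMA_NO_NODE, __builtin_return_address(0));
}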
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index c152a68811dd..345727638d52 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -74,8 +74,11 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
* We need to iterate through the pages, clearing the dcache for
* them and setting the cache-inhibit bit.
*/
+ mmap_read_lock(&init_mm);
error = walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
NULL);
+ mmap_read_unlock(&init_mm);
+
if (error)
return ERR_PTR(error);
return cpu_addr;
@@ -85,9 +88,11 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
{
unsigned long va = (unsigned long)cpu_addr;
+ mmap_read_lock(&init_mm);
/* walk_page_range shouldn't be able to fail here */
WARN_ON(walk_page_range(&init_mm, va, va + size,
&clear_nocache_walk_ops, NULL));
+ mmap_read_unlock(&init_mm);
}
void arch_sync_dma_for_device(phys_addr_t addr, size_t size,
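The locking added here reflects walk_page_range()'s contract: the walker expects the mmap lock to be held, and that applies to init_mm as well. A condensed sketch of the idiom:

#include <linux/mm.h>
#include <linux/pagewalk.h>

/*
 * Sketch: even page-table walks of the kernel address space must hold
 * init_mm's mmap lock in read mode around walk_page_range().
 */
static int walk_kernel_range(unsigned long start, unsigned long end,
			     const struct mm_walk_ops *ops, void *priv)
{
	int ret;

	mmap_read_lock(&init_mm);
	ret = walk_page_range(&init_mm, start, end, ops, priv);
	mmap_read_unlock(&init_mm);

	return ret;
}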
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a54c6a401581..2bdc72e6890e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -375,7 +375,9 @@ void __init hyperv_init(void)
guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
- hv_hypercall_pg = vmalloc_exec(PAGE_SIZE);
+ hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
+			VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
+			VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __builtin_return_address(0));
if (hv_hypercall_pg == NULL) {
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
goto remove_cpuhp_state;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2da1f95b88d7..816b31c68550 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -194,6 +194,7 @@ enum page_cache_mode {
#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G)
#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
@@ -219,6 +220,7 @@ enum page_cache_mode {
#define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC)
#define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC)
#define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0)
+#define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC)
#define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC)
#define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC)
#define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c
index a4ee6b86663e..b91e472ee764 100644
--- a/drivers/media/platform/omap3isp/isp.c
+++ b/drivers/media/platform/omap3isp/isp.c
@@ -39,8 +39,6 @@
* Troy Laramy <t-laramy@ti.com>
*/
-#include <asm/cacheflush.h>
-
#include <linux/clk.h>
#include <linux/clkdev.h>
#include <linux/delay.h>
diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c
index 10c214bd0903..1ac9aef70dff 100644
--- a/drivers/media/platform/omap3isp/ispvideo.c
+++ b/drivers/media/platform/omap3isp/ispvideo.c
@@ -18,7 +18,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
#include <media/v4l2-dev.h>
#include <media/v4l2-ioctl.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 152a0fc4e905..751bc4dc7466 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
@@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
@@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
}
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
@@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
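Condensed from the hunks above, the resulting nesting is: take the new per-superblock rwsem, then the cluster lock; drop the cluster lock, then the rwsem, so EX and PR holders now also exclude each other locally. A sketch with a hypothetical stand-in for the ocfs2_super fields involved:

#include <linux/rwsem.h>

struct nfs_sync_state {			/* hypothetical stand-in */
	struct rw_semaphore rwlock;
};

static void nfs_sync_lock_sketch(struct nfs_sync_state *s, int ex)
{
	if (ex)
		down_write(&s->rwlock);	/* exclusive caller */
	else
		down_read(&s->rwlock);	/* shared caller */
	/* ...then take the cluster (DLM) lock, as in the hunk above... */
}

static void nfs_sync_unlock_sketch(struct nfs_sync_state *s, int ex)
{
	/* ...the cluster unlock happens first... */
	if (ex)
		up_write(&s->rwlock);
	else
		up_read(&s->rwlock);
}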
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee5d98516212..2dd71d626196 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -395,6 +395,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct rw_semaphore nfs_sync_rwlock;
struct ocfs2_lock_res osb_trim_fs_lockres;
struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0dd8c41bafd4..19137c6d087b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -290,7 +290,7 @@
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT -1
+#define OCFS2_INVALID_SLOT ((u16)-1)
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
USER_QUOTA_SYSTEM_INODE,
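The sentinel change matters because slot numbers are stored as u16: comparing a u16 against a plain -1 promotes the u16 to int (65535) and the int -1 stays -1, so the test can never match. Casting the sentinel to u16 makes both sides promote to 65535. A standalone userspace demonstration with made-up values:

#include <stdint.h>
#include <stdio.h>

#define INVALID_SLOT_OLD (-1)            /* old definition */
#define INVALID_SLOT_NEW ((uint16_t)-1)  /* fixed definition */

int main(void)
{
	uint16_t slot = UINT16_MAX;	/* "empty slot" value read from disk */

	/* prints 0: slot promotes to int 65535, which is not -1 */
	printf("slot == -1       -> %d\n", slot == INVALID_SLOT_OLD);
	/* prints 1: both sides promote to int 65535 */
	printf("slot == (u16)-1  -> %d\n", slot == INVALID_SLOT_NEW);
	return 0;
}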
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4836becb7578..45745cc3408a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- inode_alloc_inode =
- ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
- suballoc_slot);
+ if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+ else
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
if (!inode_alloc_inode) {
/* the error code could be inaccurate, but we are not able to
* get the correct one. */
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 907fa5d16494..4a674db4e1fa 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -2,6 +2,11 @@
#ifndef _ASM_GENERIC_CACHEFLUSH_H
#define _ASM_GENERIC_CACHEFLUSH_H
+struct mm_struct;
+struct vm_area_struct;
+struct page;
+struct address_space;
+
/*
* The cache doesn't need to be flushed when TLB entries change when
* the cache is mapped to physical memory, not virtual memory
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c4c37fd12104..f6f884970511 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -257,8 +257,8 @@ struct lruvec {
*/
unsigned long anon_cost;
unsigned long file_cost;
- /* Evictions & activations on the inactive file list */
- atomic_long_t inactive_age;
+ /* Non-resident age, driven by LRU movement */
+ atomic_long_t nonresident_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults;
/* Various lruvec state flags (enum lruvec_flags) */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c5974bb9ba9..5b3216ba39a9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -313,6 +313,7 @@ struct vma_swap_readahead {
};
/* linux/mm/workingset.c */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 48bb681e6c2a..0221f852a7e1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd52de85..09cc78df53c6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image)
static int
kimage_validate_signature(struct kimage *image)
{
- const char *reason;
int ret;
ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
image->kernel_buf_len);
- switch (ret) {
- case 0:
- break;
+ if (ret) {
- /* Certain verification errors are non-fatal if we're not
- * checking errors, provided we aren't mandating that there
- * must be a valid signature.
- */
- case -ENODATA:
- reason = "kexec of unsigned image";
- goto decide;
- case -ENOPKG:
- reason = "kexec of image with unsupported crypto";
- goto decide;
- case -ENOKEY:
- reason = "kexec of image with unavailable key";
- decide:
if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
- pr_notice("%s rejected\n", reason);
+ pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
- /* If IMA is guaranteed to appraise a signature on the kexec
+ /*
+ * If IMA is guaranteed to appraise a signature on the kexec
* image, permit it even if the kernel is otherwise locked
* down.
*/
@@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image)
security_locked_down(LOCKDOWN_KEXEC))
return -EPERM;
- return 0;
-
- /* All other errors are fatal, including nomem, unparseable
- * signatures and signature check failures - even if signatures
- * aren't required.
- */
- default:
- pr_notice("kernel signature verification failed (%d).\n", ret);
+ pr_debug("kernel signature verification failed (%d).\n", ret);
}
- return ret;
+ return 0;
}
#endif
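Put together, the rewritten function reduces to the following decision tree. This is a condensed restatement of the hunks above, not new logic:

#include <linux/kexec.h>
#include <linux/ima.h>
#include <linux/security.h>

/*
 * Condensed restatement of the post-patch kimage_validate_signature():
 * a verification failure is fatal only when signatures are mandatory
 * (KEXEC_SIG_FORCE) or when the kernel is locked down and IMA will not
 * appraise the image; otherwise the failure is logged and tolerated.
 */
static int validate_signature_sketch(struct kimage *image)
{
	int ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
					       image->kernel_buf_len);
	if (!ret)
		return 0;

	if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE))
		return ret;			/* hard failure */

	if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
	    security_locked_down(LOCKDOWN_KEXEC))
		return -EPERM;			/* locked down, no IMA */

	return 0;				/* best-effort mode */
}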
diff --git a/kernel/module.c b/kernel/module.c
index e8a198588f26..0c6573b98c36 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return vmalloc_exec(size);
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
}
bool __weak module_init_section(const char *name)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 28528285942c..a2a82262b97b 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -520,8 +520,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
err_free:
kfree(devmem);
err_release:
- release_mem_region(devmem->pagemap.res.start,
- resource_size(&devmem->pagemap.res));
+ release_mem_region(res->start, resource_size(res));
err:
mutex_unlock(&mdevice->devmem_lock);
return false;
diff --git a/mm/compaction.c b/mm/compaction.c
index fd988b7e5f2b..86375605faa9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.page = NULL,
};
- current->capture_control = &capc;
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *capture = capc.page;
- current->capture_control = NULL;
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
return ret;
}
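The pattern in isolation: fully initialize the shared structure, publish the pointer with WRITE_ONCE() after a compiler barrier, and hide it again before reading back anything an interrupt handler may have written. A generic sketch (hypothetical names, not kernel API):

#include <linux/compiler.h>

struct result_slot {		/* hypothetical shared state */
	void *result;		/* may be filled from interrupt context */
};

static void *run_with_capture(struct result_slot **publish_point,
			      void (*work)(void))
{
	struct result_slot slot = { .result = NULL };
	void *ret;

	barrier();		/* init must not sink past the publish */
	WRITE_ONCE(*publish_point, &slot);

	work();			/* interrupts may store into slot.result */

	WRITE_ONCE(*publish_point, NULL);	/* hide first... */
	ret = READ_ONCE(slot.result);		/* ...then read the result */
	return ret;
}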
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e45623016aea..61ab16fb2e36 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
unsigned long vaddr)
{
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = ptep_get(ptep);
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
- pte = READ_ONCE(*ptep);
+ pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b38b6ad547d..19622328e4b5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
return;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw)
+ if (!cw) {
+ css_put(&memcg->css);
return;
+ }
cw->memcg = memcg;
cw->cachep = cachep;
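The leak fixed above has a common shape: a css reference is taken before the allocation, so every early return after that point must drop it. A sketch of the pairing, with a hypothetical work-item type:

#include <linux/memcontrol.h>
#include <linux/slab.h>

struct cache_create_work;	/* hypothetical work-item type */

static void schedule_cache_create_sketch(struct mem_cgroup *memcg)
{
	struct cache_create_work *cw;

	/* the caller took a css reference for the async work */
	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
	if (!cw) {
		css_put(&memcg->css);	/* pairs with the earlier css_get() */
		return;
	}
	/* ...initialize cw and queue it; the worker drops the reference... */
}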
@@ -6360,11 +6362,16 @@ static unsigned long effective_protection(unsigned long usage,
* We're using unprotected memory for the weight so that if
* some cgroups DO claim explicit protection, we don't protect
* the same bytes twice.
+ *
+ * Check both usage and parent_usage against the respective
+ * protected values. One should imply the other, but they
+ * aren't read atomically - make sure the division is sane.
*/
if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
return ep;
-
- if (parent_effective > siblings_protected && usage > protected) {
+ if (parent_effective > siblings_protected &&
+ parent_usage > siblings_protected &&
+ usage > protected) {
unsigned long unclaimed;
unclaimed = parent_effective - siblings_protected;
@@ -6416,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
- memcg->memory.elow = memcg->memory.low;
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
goto out;
}
@@ -6428,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_min_usage)));
WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
- memcg->memory.low, READ_ONCE(parent->memory.elow),
+ READ_ONCE(memcg->memory.low),
+ READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
out:
diff --git a/mm/memory.c b/mm/memory.c
index dc7f3543b1fd..87ec87cdc1ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ out:
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
if (!page_count(page))
return -EINVAL;
err = validate_page_before_insert(page);
- return err ? err : insert_page_into_pte_locked(
- mm, pte_offset_map(pmd, addr), addr, page, prot);
+ if (err)
+ return err;
+ return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
{
pmd_t *pmd = NULL;
- spinlock_t *pte_lock = NULL;
+ pte_t *start_pte, *pte;
+ spinlock_t *pte_lock;
struct mm_struct *const mm = vma->vm_mm;
unsigned long curr_page_idx = 0;
unsigned long remaining_pages_total = *num;
@@ -1536,18 +1538,17 @@ more:
ret = -ENOMEM;
if (pte_alloc(mm, pmd))
goto out;
- pte_lock = pte_lockptr(mm, pmd);
while (pages_to_write_in_pmd) {
int pte_idx = 0;
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
- spin_lock(pte_lock);
- for (; pte_idx < batch_size; ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pmd,
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
@@ -1555,7 +1556,7 @@ more:
addr += PAGE_SIZE;
++curr_page_idx;
}
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
@@ -3140,8 +3141,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
err = mem_cgroup_charge(page, vma->vm_mm,
GFP_KERNEL);
ClearPageSwapCache(page);
- if (err)
+ if (err) {
+ ret = VM_FAULT_OOM;
goto out_page;
+ }
+
+ /*
+ * XXX: Move to lru_cache_add() when it
+ * supports new vs putback
+ */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
lru_cache_add(page);
swap_readpage(page, true);
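For the insert_pages() change, the relevant contract is that pte_offset_map_lock() maps the PTE page (a real kmap on 32-bit CONFIG_HIGHPTE configurations) and takes the PTE lock in one step, while pte_unmap_unlock() undoes both; the old spin_lock(pte_lockptr(...)) path locked without ever mapping. Sketch of the idiom:

#include <linux/mm.h>

/*
 * Sketch: iterate PTEs under the PTE lock with proper map/unmap
 * pairing. The pmd must already be populated (see pte_alloc() above).
 */
static void for_each_pte_locked(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *start_pte, *pte;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	for (pte = start_pte; addr < end; pte++, addr += PAGE_SIZE) {
		/* operate on *pte while it is mapped and locked */
	}
	pte_unmap_unlock(start_pte, ptl);
}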
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9b34e03e730a..da374cd3d45b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
{
+ const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long flags;
+ unsigned long pfn, cur_nr_pages, flags;
/* Poison struct pages because they are now uninitialized again. */
- page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+ cond_resched();
+
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages =
+ min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+ page_init_poison(pfn_to_page(pfn),
+ sizeof(struct page) * cur_nr_pages);
+ }
#ifdef CONFIG_ZONE_DEVICE
/*
diff --git a/mm/nommu.c b/mm/nommu.c
index cdcad5d61dd1..f32a69095d50 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -291,23 +291,6 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- */
-
-void *vmalloc_exec(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
-}
-
-/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
diff --git a/mm/slab.h b/mm/slab.h
index 207c83ef6e06..74f7e09a7cfd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- unsigned int nr_pages = 1 << order;
+ int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
int ret;
@@ -388,7 +388,7 @@ out:
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- unsigned int nr_pages = 1 << order;
+ int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9e72ba224175..37d48a56431d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1726,7 +1726,7 @@ void kzfree(const void *p)
if (unlikely(ZERO_OR_NULL_PTR(mem)))
return;
ks = ksize(mem);
- memset(mem, 0, ks);
+ memzero_explicit(mem, ks);
kfree(mem);
}
EXPORT_SYMBOL(kzfree);
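A memset() immediately before kfree() is a dead store from the compiler's point of view and may be optimized away, leaving the sensitive bytes in memory; memzero_explicit() adds a barrier so the store survives. A userspace analogue of the helper:

#include <string.h>

/*
 * Userspace sketch of memzero_explicit(): the empty asm with a
 * "memory" clobber tells the compiler the zeroed bytes are observed,
 * so dead-store elimination cannot remove the memset().
 */
static void memzero_explicit_sketch(void *s, size_t count)
{
	memset(s, 0, count);
	__asm__ __volatile__("" : : "r" (s) : "memory");
}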
diff --git a/mm/slub.c b/mm/slub.c
index fe81773fd97e..ef303070d175 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3766,15 +3766,13 @@ error:
}
static void list_slab_objects(struct kmem_cache *s, struct page *page,
- const char *text, unsigned long *map)
+ const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
+ unsigned long *map;
void *p;
- if (!map)
- return;
-
slab_err(s, page, text, s->name);
slab_lock(page);
@@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
+ put_map(map);
slab_unlock(page);
#endif
}
@@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
LIST_HEAD(discard);
struct page *page, *h;
- unsigned long *map = NULL;
-
-#ifdef CONFIG_SLUB_DEBUG
- map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
-#endif
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
@@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on __kmem_cache_shutdown()",
- map);
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
-#ifdef CONFIG_SLUB_DEBUG
- bitmap_free(map);
-#endif
-
list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
diff --git a/mm/swap.c b/mm/swap.c
index dbcab84c6fce..a82efc33411f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
- if (page_is_file_lru(page))
- workingset_activation(page);
+ workingset_activation(page);
}
if (page_is_idle(page))
clear_page_idle(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e98ff460e9e9..05889e8e3c97 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
-
+#include "internal.h"
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageSwapBacked(page);
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) {
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) {
put_swap_page(page, entry);
goto fail_unlock;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3091c2ca60df..5a2b55c8dd9a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @pages: an array of pointers to the pages to be mapped
* @count: number of pages
* @node: prefer to allocate data structures on this node
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
* faster than vmap so it's good. But if you mix long-life and short-life
@@ -2696,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_exec(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6d84326bdf2..749d239c62b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
__delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
+ workingset_eviction(page, target_memcg);
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
@@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
list_add(&page->lru, &pages_to_free);
} else {
nr_moved += nr_pages;
+ if (PageActive(page))
+ workingset_age_nonresident(lruvec, nr_pages);
}
}
diff --git a/mm/workingset.c b/mm/workingset.c
index d481ea452eeb..50b7937bab32 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -156,8 +156,8 @@
*
* Implementation
*
- * For each node's file LRU lists, a counter for inactive evictions
- * and activations is maintained (node->inactive_age).
+ * For each node's LRU lists, a counter for inactive evictions and
+ * activations is maintained (node->nonresident_age).
*
* On eviction, a snapshot of this counter (along with some bits to
* identify the node) is stored in the now empty page cache
@@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}
-static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
/*
* Reclaiming a cgroup means reclaiming all its children in a
@@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
* the root cgroup's, age as well.
*/
do {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- atomic_long_inc(&lruvec->inactive_age);
- } while (memcg && (memcg = parent_mem_cgroup(memcg)));
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
}
/**
@@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- advance_inactive_age(page_memcg(page), pgdat);
-
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ workingset_age_nonresident(lruvec, hpage_nr_pages(page));
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->inactive_age);
+ eviction = atomic_long_read(&lruvec->nonresident_age);
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow)
if (!mem_cgroup_disabled() && !eviction_memcg)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
- refault = atomic_long_read(&eviction_lruvec->inactive_age);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
/*
* Calculate the refault distance
*
* The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases. There is a
+ * across nonresident_age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
- * inactive_age to lap a shadow entry in the field, which can
- * then result in a false small refault distance, leading to a
- * false activation should this old entry actually refault
- * again. However, earlier kernels used to deactivate
+ * nonresident_age to lap a shadow entry in the field, which
+ * can then result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
@@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow)
goto out;
SetPageActive(page);
- advance_inactive_age(memcg, pgdat);
+ workingset_age_nonresident(lruvec, hpage_nr_pages(page));
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
/* Page was active prior to eviction */
@@ -382,6 +388,7 @@ out:
void workingset_activation(struct page *page)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
rcu_read_lock();
/*
@@ -394,7 +401,8 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- advance_inactive_age(memcg, page_pgdat(page));
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ workingset_age_nonresident(lruvec, hpage_nr_pages(page));
out:
rcu_read_unlock();
}
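The "unsigned subtraction" remark above is worth making concrete: because eviction and refault timestamps are unsigned snapshots of the same counter, the distance refault - eviction stays correct even when the counter wraps between the two snapshots. A standalone demonstration with made-up values:

#include <stdio.h>

int main(void)
{
	/* counter snapshot taken just before wrapping around... */
	unsigned long eviction = (unsigned long)-10;
	/* ...and a later snapshot, taken after the wrap */
	unsigned long refault = 15;

	/* prints 25: 10 events before the wrap plus 15 after */
	printf("refault distance = %lu\n", refault - eviction);
	return 0;
}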