author    Linus Torvalds <torvalds@linux-foundation.org>  2020-10-18 12:25:25 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2020-10-18 12:25:25 -0700
commit    1912b04e0f9b116faf2e61c5432980f87c389c51 (patch)
tree      b86ca0faa1cf768e423c571856a8ff20b7d60376
parent    9453b2d4694c2cb6c30d99e65d4a3deb09e94ac3 (diff)
parent    c922781fef43d2ddbdef36a3a281441bb153377b (diff)
Merge branch 'akpm' (patches from Andrew)
Merge yet more updates from Andrew Morton:
"Subsystems affected by this patch series: mm (memcg, migration,
pagemap, gup, madvise, vmalloc), ia64, and misc"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (31 commits)
mm: remove duplicate include statement in mmu.c
mm: remove the filename in the top of file comment in vmalloc.c
mm: cleanup the gfp_mask handling in __vmalloc_area_node
mm: remove alloc_vm_area
x86/xen: open code alloc_vm_area in arch_gnttab_valloc
xen/xenbus: use apply_to_page_range directly in xenbus_map_ring_pv
drm/i915: use vmap in i915_gem_object_map
drm/i915: stop using kmap in i915_gem_object_map
drm/i915: use vmap in shmem_pin_map
zsmalloc: switch from alloc_vm_area to get_vm_area
mm: allow a NULL fn callback in apply_to_page_range
mm: add a vmap_pfn function
mm: add a VM_MAP_PUT_PAGES flag for vmap
mm: update the documentation for vfree
mm/madvise: introduce process_madvise() syscall: an external memory hinting API
pid: move pidfd_get_pid() to pid.c
mm/madvise: pass mm to do_madvise
selftests/vm: 10x speedup for hmm-tests
binfmt_elf: take the mmap lock around find_extend_vma()
mm/gup_benchmark: take the mmap lock around GUP
...
55 files changed, 592 insertions, 439 deletions
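The process_madvise() entries added across the syscall tables below all wire up syscall number 440 (number 550 on alpha). A minimal, hedged userspace sketch of invoking the new call follows; the target pid and the advised range are placeholders, and raw syscall(2) numbers are used since no glibc wrapper existed at the time:

/*
 * Hypothetical example: page out a range of another process's memory.
 * Requires PTRACE_MODE_ATTACH-level access to the target.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#ifndef __NR_process_madvise
#define __NR_process_madvise 440
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21        /* MADV_COLD is 20 */
#endif

int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;
    pid_t target = (pid_t)atoi(argv[1]);    /* pid of the target process */
    int pidfd = syscall(__NR_pidfd_open, target, 0);
    if (pidfd < 0) {
        perror("pidfd_open");
        return 1;
    }

    /* One or more address ranges inside the *target* address space. */
    struct iovec vec = {
        .iov_base = (void *)0x7f0000000000ul,    /* hypothetical range */
        .iov_len  = 1ul << 20,
    };

    /* flags must be 0; on success returns the number of bytes advised. */
    ssize_t ret = syscall(__NR_process_madvise, pidfd, &vec, 1,
                          MADV_PAGEOUT, 0);
    if (ret < 0)
        perror("process_madvise");
    else
        printf("advised %zd bytes\n", ret);
    close(pidfd);
    return 0;
}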
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ec8bed9e7b75..ee7b01bb7346 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
 547 common openat2 sys_openat2
 548 common pidfd_getfd sys_pidfd_getfd
 549 common faccessat2 sys_faccessat2
+550 common process_madvise sys_process_madvise
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 698cc740c6b8..ab69250a86bc 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -17,7 +17,6 @@
 #include <asm/cp15.h>
 #include <asm/cputype.h>
-#include <asm/sections.h>
 #include <asm/cachetype.h>
 #include <asm/fixmap.h>
 #include <asm/sections.h>
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 171077cbf419..d056a548358e 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -453,3 +453,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 440
+#define __NR_compat_syscalls 441
 #endif
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 2a3ad9b9accd..107f08e03b9f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_process_madvise 440
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 81901c5e5426..c89bd5f8cbf8 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -40,7 +40,7 @@
 obj-y += esi_stub.o # must be in kernel proper
 endif
 obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
 
 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
 CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 4799c96c325f..b96ed8b8a508 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -360,3 +360,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 81fc799d8392..625fb6d32842 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index b4e263916f41..aae729c95cf9 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -445,3 +445,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index cf72a0206a87..32817c954435 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -378,3 +378,4 @@
 437 n32 openat2 sys_openat2
 438 n32 pidfd_getfd sys_pidfd_getfd
 439 n32 faccessat2 sys_faccessat2
+440 n32 process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 557f9954a2b9..9e4ea3c31b1c 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -354,3 +354,4 @@
 437 n64 openat2 sys_openat2
 438 n64 pidfd_getfd sys_pidfd_getfd
 439 n64 faccessat2 sys_faccessat2
+440 n64 process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index a17aab5abeb2..29f5f28cf5ce 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -427,3 +427,4 @@
 437 o32 openat2 sys_openat2
 438 o32 pidfd_getfd sys_pidfd_getfd
 439 o32 faccessat2 sys_faccessat2
+440 o32 process_madvise sys_process_madvise
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index ae3dab371f6f..38c63e5404bc 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -437,3 +437,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 9d7fb4ced290..1275daec7fec 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -529,3 +529,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 1c3b48165e86..28c168000483 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@
 437 common openat2 sys_openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise sys_process_madvise
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index ae0a00beea5f..783738448ff5 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 37ec52b34c73..78160260991b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -485,3 +485,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9b6931f8d555..0d0667a9fbd7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -444,3 +444,4 @@
 437 i386 openat2 sys_openat2
 438 i386 pidfd_getfd sys_pidfd_getfd
 439 i386 faccessat2 sys_faccessat2
+440 i386 process_madvise sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 347809649ba2..1f47e24fb65c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -361,6 +361,7 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 4988e19598c8..1e681bf62561 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -25,6 +25,7 @@
 static struct gnttab_vm_area {
     struct vm_struct *area;
     pte_t **ptes;
+    int idx;
 } gnttab_shared_vm_area, gnttab_status_vm_area;
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
@@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
     }
 }
 
+static int gnttab_apply(pte_t *pte, unsigned long addr, void *data)
+{
+    struct gnttab_vm_area *area = data;
+
+    area->ptes[area->idx++] = pte;
+    return 0;
+}
+
 static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
 {
     area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
     if (area->ptes == NULL)
         return -ENOMEM;
-
-    area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
-    if (area->area == NULL) {
-        kfree(area->ptes);
-        return -ENOMEM;
-    }
-
+    area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP);
+    if (!area->area)
+        goto out_free_ptes;
+    if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr,
+            PAGE_SIZE * nr_frames, gnttab_apply, area))
+        goto out_free_vm_area;
     return 0;
+out_free_vm_area:
+    free_vm_area(area->area);
+out_free_ptes:
+    kfree(area->ptes);
+    return -ENOMEM;
 }
 
 static void arch_gnttab_vfree(struct gnttab_vm_area *area)
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 6276e3c2d3fc..b070f272995d 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -410,3 +410,4 @@
 437 common openat2 sys_openat2
 438 common pidfd_getfd sys_pidfd_getfd
 439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 9afa5c4a6bf0..1e1cb245fca7 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -25,6 +25,7 @@ config DRM_I915
     select CRC32
     select SND_HDA_I915 if SND_HDA_CORE
     select CEC_CORE if CEC_NOTIFIER
+    select VMAP_PFN
     help
       Choose this option if you have a system that has "Intel Graphics
       Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index d6eeefab3d01..f60ca6dc911f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -162,8 +162,6 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
 {
     if (is_vmalloc_addr(ptr))
         vunmap(ptr);
-    else
-        kunmap(kmap_to_page(ptr));
 }
 
 struct sg_table *
@@ -234,34 +232,21 @@ unlock:
     return err;
 }
 
-static inline pte_t iomap_pte(resource_size_t base,
-                  dma_addr_t offset,
-                  pgprot_t prot)
-{
-    return pte_mkspecial(pfn_pte((base + offset) >> PAGE_SHIFT, prot));
-}
-
 /* The 'mapping' part of i915_gem_object_pin_map() below */
-static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
-                 enum i915_map_type type)
+static void *i915_gem_object_map_page(struct drm_i915_gem_object *obj,
+                      enum i915_map_type type)
 {
-    unsigned long n_pte = obj->base.size >> PAGE_SHIFT;
-    struct sg_table *sgt = obj->mm.pages;
-    pte_t *stack[32], **mem;
-    struct vm_struct *area;
+    unsigned long n_pages = obj->base.size >> PAGE_SHIFT, i;
+    struct page *stack[32], **pages = stack, *page;
+    struct sgt_iter iter;
     pgprot_t pgprot;
+    void *vaddr;
 
-    if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC)
-        return NULL;
-
-    if (GEM_WARN_ON(type == I915_MAP_WC &&
-            !static_cpu_has(X86_FEATURE_PAT)))
-        return NULL;
-
-    /* A single page can always be kmapped */
-    if (n_pte == 1 && type == I915_MAP_WB) {
-        struct page *page = sg_page(sgt->sgl);
-
+    switch (type) {
+    default:
+        MISSING_CASE(type);
+        fallthrough;    /* to use PAGE_KERNEL anyway */
+    case I915_MAP_WB:
         /*
          * On 32b, highmem using a finite set of indirect PTE (i.e.
          * vmap) to provide virtual mappings of the high pages.
@@ -277,33 +262,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
          * forever.
          *
          * So if the page is beyond the 32b boundary, make an explicit
-         * vmap. On 64b, this check will be optimised away as we can
-         * directly kmap any page on the system.
+         * vmap.
          */
-        if (!PageHighMem(page))
-            return kmap(page);
-    }
-
-    mem = stack;
-    if (n_pte > ARRAY_SIZE(stack)) {
-        /* Too big for stack -- allocate temporary array instead */
-        mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
-        if (!mem)
-            return NULL;
-    }
-
-    area = alloc_vm_area(obj->base.size, mem);
-    if (!area) {
-        if (mem != stack)
-            kvfree(mem);
-        return NULL;
-    }
-
-    switch (type) {
-    default:
-        MISSING_CASE(type);
-        fallthrough;    /* to use PAGE_KERNEL anyway */
-    case I915_MAP_WB:
+        if (n_pages == 1 && !PageHighMem(sg_page(obj->mm.pages->sgl)))
+            return page_address(sg_page(obj->mm.pages->sgl));
         pgprot = PAGE_KERNEL;
         break;
     case I915_MAP_WC:
@@ -311,30 +273,50 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
         break;
     }
 
-    if (i915_gem_object_has_struct_page(obj)) {
-        struct sgt_iter iter;
-        struct page *page;
-        pte_t **ptes = mem;
+    if (n_pages > ARRAY_SIZE(stack)) {
+        /* Too big for stack -- allocate temporary array instead */
+        pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
+        if (!pages)
+            return NULL;
+    }
 
-        for_each_sgt_page(page, iter, sgt)
-            **ptes++ = mk_pte(page, pgprot);
-    } else {
-        resource_size_t iomap;
-        struct sgt_iter iter;
-        pte_t **ptes = mem;
-        dma_addr_t addr;
+    i = 0;
+    for_each_sgt_page(page, iter, obj->mm.pages)
+        pages[i++] = page;
+    vaddr = vmap(pages, n_pages, 0, pgprot);
+    if (pages != stack)
+        kvfree(pages);
+    return vaddr;
+}
 
-        iomap = obj->mm.region->iomap.base;
-        iomap -= obj->mm.region->region.start;
+static void *i915_gem_object_map_pfn(struct drm_i915_gem_object *obj,
+                     enum i915_map_type type)
+{
+    resource_size_t iomap = obj->mm.region->iomap.base -
+        obj->mm.region->region.start;
+    unsigned long n_pfn = obj->base.size >> PAGE_SHIFT;
+    unsigned long stack[32], *pfns = stack, i;
+    struct sgt_iter iter;
+    dma_addr_t addr;
+    void *vaddr;
+
+    if (type != I915_MAP_WC)
+        return NULL;
 
-        for_each_sgt_daddr(addr, iter, sgt)
-            **ptes++ = iomap_pte(iomap, addr, pgprot);
+    if (n_pfn > ARRAY_SIZE(stack)) {
+        /* Too big for stack -- allocate temporary array instead */
+        pfns = kvmalloc_array(n_pfn, sizeof(*pfns), GFP_KERNEL);
+        if (!pfns)
+            return NULL;
     }
 
-    if (mem != stack)
-        kvfree(mem);
-
-    return area->addr;
+    i = 0;
+    for_each_sgt_daddr(addr, iter, obj->mm.pages)
+        pfns[i++] = (iomap + addr) >> PAGE_SHIFT;
+    vaddr = vmap_pfn(pfns, n_pfn, pgprot_writecombine(PAGE_KERNEL_IO));
+    if (pfns != stack)
+        kvfree(pfns);
+    return vaddr;
 }
 
 /* get, pin, and map the pages of the object into kernel space */
@@ -386,7 +368,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
     }
 
     if (!ptr) {
-        ptr = i915_gem_object_map(obj, type);
+        if (GEM_WARN_ON(type == I915_MAP_WC &&
+                !static_cpu_has(X86_FEATURE_PAT)))
+            ptr = NULL;
+        else if (i915_gem_object_has_struct_page(obj))
+            ptr = i915_gem_object_map_page(obj, type);
+        else
+            ptr = i915_gem_object_map_pfn(obj, type);
         if (!ptr) {
             err = -ENOMEM;
             goto err_unpin;
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 43c7acbdc79d..f011ea42487e 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -49,80 +49,40 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
     return file;
 }
 
-static size_t shmem_npte(struct file *file)
-{
-    return file->f_mapping->host->i_size >> PAGE_SHIFT;
-}
-
-static void __shmem_unpin_map(struct file *file, void *ptr, size_t n_pte)
-{
-    unsigned long pfn;
-
-    vunmap(ptr);
-
-    for (pfn = 0; pfn < n_pte; pfn++) {
-        struct page *page;
-
-        page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
-                           GFP_KERNEL);
-        if (!WARN_ON(IS_ERR(page))) {
-            put_page(page);
-            put_page(page);
-        }
-    }
-}
-
 void *shmem_pin_map(struct file *file)
 {
-    const size_t n_pte = shmem_npte(file);
-    pte_t *stack[32], **ptes, **mem;
-    struct vm_struct *area;
-    unsigned long pfn;
-
-    mem = stack;
-    if (n_pte > ARRAY_SIZE(stack)) {
-        mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL);
-        if (!mem)
-            return NULL;
-    }
+    struct page **pages;
+    size_t n_pages, i;
+    void *vaddr;
 
-    area = alloc_vm_area(n_pte << PAGE_SHIFT, mem);
-    if (!area) {
-        if (mem != stack)
-            kvfree(mem);
+    n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT;
+    pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
+    if (!pages)
         return NULL;
-    }
 
-    ptes = mem;
-    for (pfn = 0; pfn < n_pte; pfn++) {
-        struct page *page;
-
-        page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
-                           GFP_KERNEL);
-        if (IS_ERR(page))
+    for (i = 0; i < n_pages; i++) {
+        pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i,
+                               GFP_KERNEL);
+        if (IS_ERR(pages[i]))
             goto err_page;
-
-        **ptes++ = mk_pte(page, PAGE_KERNEL);
     }
 
-    if (mem != stack)
-        kvfree(mem);
-
+    vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL);
+    if (!vaddr)
+        goto err_page;
     mapping_set_unevictable(file->f_mapping);
-    return area->addr;
-
+    return vaddr;
 err_page:
-    if (mem != stack)
-        kvfree(mem);
-
-    __shmem_unpin_map(file, area->addr, pfn);
+    while (--i >= 0)
+        put_page(pages[i]);
+    kvfree(pages);
     return NULL;
 }
 
 void shmem_unpin_map(struct file *file, void *ptr)
 {
     mapping_clear_unevictable(file->f_mapping);
-    __shmem_unpin_map(file, ptr, shmem_npte(file));
+    vfree(ptr);
 }
 
 static int __shmem_rw(struct file *file, loff_t off,
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 2690318ad50f..fd80e318b99c 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -73,16 +73,13 @@ struct map_ring_valloc {
     struct xenbus_map_node *node;
 
     /* Why do we need two arrays? See comment of __xenbus_map_ring */
-    union {
-        unsigned long addrs[XENBUS_MAX_RING_GRANTS];
-        pte_t *ptes[XENBUS_MAX_RING_GRANTS];
-    };
+    unsigned long addrs[XENBUS_MAX_RING_GRANTS];
     phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
 
     struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS];
     struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
 
-    unsigned int idx;    /* HVM only. */
+    unsigned int idx;
 };
 
 static DEFINE_SPINLOCK(xenbus_valloc_lock);
@@ -686,6 +683,14 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
 EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
 
 #ifdef CONFIG_XEN_PV
+static int map_ring_apply(pte_t *pte, unsigned long addr, void *data)
+{
+    struct map_ring_valloc *info = data;
+
+    info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr;
+    return 0;
+}
+
 static int xenbus_map_ring_pv(struct xenbus_device *dev,
                   struct map_ring_valloc *info,
                   grant_ref_t *gnt_refs,
@@ -694,18 +699,15 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev,
 {
     struct xenbus_map_node *node = info->node;
     struct vm_struct *area;
-    int err = GNTST_okay;
-    int i;
-    bool leaked;
+    bool leaked = false;
+    int err = -ENOMEM;
 
-    area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, info->ptes);
+    area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP);
     if (!area)
         return -ENOMEM;
-
-    for (i = 0; i < nr_grefs; i++)
-        info->phys_addrs[i] =
-            arbitrary_virt_to_machine(info->ptes[i]).maddr;
-
+    if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+                XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info))
+        goto failed;
     err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles,
                 info, GNTMAP_host_map | GNTMAP_contains_pte,
                 &leaked);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e7e9d0cde51a..b6b3d052ca86 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -310,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
      * Grow the stack manually; some architectures have a limit on how
      * far ahead a user-space access may be in order to grow the stack.
      */
+    if (mmap_read_lock_killable(mm))
+        return -EINTR;
     vma = find_extend_vma(mm, bprm->p);
+    mmap_read_unlock(mm);
     if (!vma)
         return -EFAULT;
diff --git a/fs/buffer.c b/fs/buffer.c
index 5a28a6aa7f16..23f645657488 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
     struct buffer_head *bh, *head;
     gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
     long offset;
-    struct mem_cgroup *memcg;
+    struct mem_cgroup *memcg, *old_memcg;
 
     if (retry)
         gfp |= __GFP_NOFAIL;
 
     memcg = get_mem_cgroup_from_page(page);
-    memalloc_use_memcg(memcg);
+    old_memcg = set_active_memcg(memcg);
 
     head = NULL;
     offset = PAGE_SIZE;
@@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
         set_bh_page(bh, page, offset);
     }
 out:
-    memalloc_unuse_memcg();
+    set_active_memcg(old_memcg);
     mem_cgroup_put(memcg);
     return head;
 /*
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e1dc354cd08..b58169240c77 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3989,7 +3989,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
     if (force_nonblock)
         return -EAGAIN;
 
-    ret = do_madvise(ma->addr, ma->len, ma->advice);
+    ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
     if (ret < 0)
         req_set_fail_links(req);
     io_req_complete(req, ret);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index c942910a8649..9167884a61ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
     struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
     const struct path *path = fsnotify_data_path(data, data_type);
     unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+    struct mem_cgroup *old_memcg;
     struct inode *child = NULL;
     bool name_event = false;
@@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
         gfp |= __GFP_RETRY_MAYFAIL;
 
     /* Whoever is interested in the event, pays for the allocation. */
-    memalloc_use_memcg(group->memcg);
+    old_memcg = set_active_memcg(group->memcg);
 
     if (fanotify_is_perm_event(mask)) {
         event = fanotify_alloc_perm_event(path, gfp);
@@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
     event->pid = get_pid(task_tgid(current));
 
 out:
-    memalloc_unuse_memcg();
+    set_active_memcg(old_memcg);
     return event;
 }
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index a65cf8c9f600..9ddcbadc98e2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
     int ret;
     int len = 0;
     int alloc_len = sizeof(struct inotify_event_info);
+    struct mem_cgroup *old_memcg;
 
     if ((inode_mark->mask & FS_EXCL_UNLINK) &&
         path && d_unlinked(path->dentry))
@@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
      * trigger OOM killer in the target monitoring memcg as it may have
      * security repercussion.
      */
-    memalloc_use_memcg(group->memcg);
+    old_memcg = set_active_memcg(group->memcg);
     event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
-    memalloc_unuse_memcg();
+    set_active_memcg(old_memcg);
 
     if (unlikely(!event)) {
         /*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6ef4a552e09d..e391e3c56de5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1531,18 +1531,6 @@ static inline bool memcg_kmem_enabled(void)
     return static_branch_likely(&memcg_kmem_enabled_key);
 }
 
-static inline bool memcg_kmem_bypass(void)
-{
-    if (in_interrupt())
-        return true;
-
-    /* Allow remote memcg charging in kthread contexts. */
-    if ((!current->mm || (current->flags & PF_KTHREAD)) &&
-        !current->active_memcg)
-        return true;
-    return false;
-}
-
 static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                      int order)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61a2633fcc7f..ef360fe70aaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
                struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
              struct list_head *uf);
-extern int do_madvise(unsigned long start, size_t len_in, int behavior);
+extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
 
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 176d6cf80e7c..fa10acb8d6a4 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops;
 struct file;
 
 extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 981e34cb1409..d5ece7a9a403 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -279,39 +279,38 @@ static inline void memalloc_nocma_restore(unsigned int flags)
 #endif
 
 #ifdef CONFIG_MEMCG
+DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 /**
- * memalloc_use_memcg - Starts the remote memcg charging scope.
+ * set_active_memcg - Starts the remote memcg charging scope.
  * @memcg: memcg to charge.
  *
  * This function marks the beginning of the remote memcg charging scope. All the
  * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
  * given memcg.
  *
- * NOTE: This function is not nesting safe.
+ * NOTE: This function can nest. Users must save the return value and
+ * reset the previous value after their own charging scope is over.
  */
-static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+static inline struct mem_cgroup *
+set_active_memcg(struct mem_cgroup *memcg)
 {
-    WARN_ON_ONCE(current->active_memcg);
-    current->active_memcg = memcg;
-}
+    struct mem_cgroup *old;
+
+    if (in_interrupt()) {
+        old = this_cpu_read(int_active_memcg);
+        this_cpu_write(int_active_memcg, memcg);
+    } else {
+        old = current->active_memcg;
+        current->active_memcg = memcg;
+    }
 
-/**
- * memalloc_unuse_memcg - Ends the remote memcg charging scope.
- *
- * This function marks the end of the remote memcg charging scope started by
- * memalloc_use_memcg().
- */
-static inline void memalloc_unuse_memcg(void)
-{
-    current->active_memcg = NULL;
+    return old;
 }
 #else
-static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
-{
-}
-
-static inline void memalloc_unuse_memcg(void)
+static inline struct mem_cgroup *
+set_active_memcg(struct mem_cgroup *memcg)
 {
+    return NULL;
 }
 #endif
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 06db09875aa4..2eda7678fe1d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void);
 asmlinkage long sys_mincore(unsigned long start, size_t len,
                 unsigned char __user * vec);
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
+asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
+            size_t vlen, int behavior, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
             unsigned long prot, unsigned long pgoff,
             unsigned long flags);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..938eaf9517e2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -24,6 +24,7 @@ struct notifier_block;    /* in notifier.h */
 #define VM_UNINITIALIZED    0x00000020    /* vm_struct is not fully initialized */
 #define VM_NO_GUARD         0x00000040    /* don't add guard page */
 #define VM_KASAN            0x00000080    /* has allocated kasan shadow memory */
+#define VM_MAP_PUT_PAGES    0x00000100    /* put pages and free array in vfree */
 
 /*
  * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -121,6 +122,7 @@ extern void vfree_atomic(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
             unsigned long flags, pgprot_t prot);
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
 extern void vunmap(const void *addr);
 
 extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
@@ -167,6 +169,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
                     unsigned long flags,
                     unsigned long start, unsigned long end,
                     const void *caller);
+void free_vm_area(struct vm_struct *area);
 extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
@@ -202,10 +205,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* Allocate/destroy a 'vmalloc' VM area. */
-extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
-extern void free_vm_area(struct vm_struct *area);
-
 /* for /dev/kmem */
 extern long vread(char *buf, char *addr, unsigned long count);
 extern long vwrite(char *buf, char *addr, unsigned long count);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f2b5d72a46c2..2056318988f7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2)
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_process_madvise 440
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
 
 #undef __NR_syscalls
-#define __NR_syscalls 440
+#define __NR_syscalls 441
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f51c27bae59..87a2d515de0d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1474,25 +1474,6 @@ end:
     return retval;
 }
 
-static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
-{
-    struct fd f;
-    struct pid *pid;
-
-    f = fdget(fd);
-    if (!f.file)
-        return ERR_PTR(-EBADF);
-
-    pid = pidfd_pid(f.file);
-    if (!IS_ERR(pid)) {
-        get_pid(pid);
-        *flags = f.file->f_flags;
-    }
-
-    fdput(f);
-    return pid;
-}
-
 static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
               int options, struct rusage *ru)
 {
diff --git a/kernel/pid.c b/kernel/pid.c
index 74ddbff1a6ba..a96bc4bf4f86 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
     return idr_get_next(&ns->idr, &nr);
 }
 
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+    struct fd f;
+    struct pid *pid;
+
+    f = fdget(fd);
+    if (!f.file)
+        return ERR_PTR(-EBADF);
+
+    pid = pidfd_pid(f.file);
+    if (!IS_ERR(pid)) {
+        get_pid(pid);
+        *flags = f.file->f_flags;
+    }
+
+    fdput(f);
+    return pid;
+}
+
 /**
  * pidfd_create() - Create a new pid file descriptor.
  *
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c925d1e1777e..f27ac94d5fa7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -280,6 +280,7 @@ COND_SYSCALL(mlockall);
 COND_SYSCALL(munlockall);
 COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
+COND_SYSCALL(process_madvise);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL_COMPAT(mbind);
diff --git a/mm/Kconfig b/mm/Kconfig
index c7f30f8b282b..d42423f884a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -816,6 +816,9 @@ config DEVICE_PRIVATE
       memory; i.e., memory that is only accessible from the device (or
       group of devices). You likely also want to select HMM_MIRROR.
 
+config VMAP_PFN
+    bool
+
 config FRAME_VECTOR
     bool
 
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 464cae1fa3ea..8b3e5b5cd8fa 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -72,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
     int nr;
     struct page **pages;
     int ret = 0;
+    bool needs_mmap_lock =
+        cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
 
     if (gup->size > ULONG_MAX)
         return -EINVAL;
@@ -81,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
     if (!pages)
         return -ENOMEM;
 
+    if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+        ret = -EINTR;
+        goto free_pages;
+    }
+
     i = 0;
     nr = gup->nr_pages_per_call;
     start_time = ktime_get();
@@ -120,9 +127,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
                             pages + i, NULL);
             break;
         default:
-            kvfree(pages);
             ret = -EINVAL;
-            goto out;
+            goto unlock;
         }
 
         if (nr <= 0)
@@ -150,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
     end_time = ktime_get();
     gup->put_delta_usec = ktime_us_delta(end_time, start_time);
 
+unlock:
+    if (needs_mmap_lock)
+        mmap_read_unlock(current->mm);
+free_pages:
     kvfree(pages);
-out:
     return ret;
 }
 
diff --git a/mm/madvise.c b/mm/madvise.c
index fd1f448b4e1d..416a56b8e757 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,8 @@
 #include <linux/falloc.h>
 #include <linux/fadvise.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -27,7 +29,6 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
 
 #include <asm/tlb.h>
 
@@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
                  struct vm_area_struct **prev,
                  unsigned long start, unsigned long end)
 {
+    struct mm_struct *mm = vma->vm_mm;
     struct file *file = vma->vm_file;
     loff_t offset;
 
@@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
     get_file(file);
     offset = (loff_t)(start - vma->vm_start)
             + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-    mmap_read_unlock(current->mm);
+    mmap_read_unlock(mm);
     vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
     fput(file);
-    mmap_read_lock(current->mm);
+    mmap_read_lock(mm);
     return 0;
 }
 
@@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   int behavior)
 {
+    struct mm_struct *mm = vma->vm_mm;
+
     *prev = vma;
     if (!can_madv_lru_vma(vma))
         return -EINVAL;
@@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
     if (!userfaultfd_remove(vma, start, end)) {
         *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 
-        mmap_read_lock(current->mm);
-        vma = find_vma(current->mm, start);
+        mmap_read_lock(mm);
+        vma = find_vma(mm, start);
         if (!vma)
             return -ENOMEM;
         if (start < vma->vm_start) {
@@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
     loff_t offset;
     int error;
     struct file *f;
+    struct mm_struct *mm = vma->vm_mm;
 
     *prev = NULL;    /* tell sys_madvise we drop mmap_lock */
 
@@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
     get_file(f);
     if (userfaultfd_remove(vma, start, end)) {
         /* mmap_lock was not released by userfaultfd_remove() */
-        mmap_read_unlock(current->mm);
+        mmap_read_unlock(mm);
     }
     error = vfs_fallocate(f,
                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                 offset, end - start);
     fput(f);
-    mmap_read_lock(current->mm);
+    mmap_read_lock(mm);
     return error;
 }
 
@@ -984,6 +989,18 @@ madvise_behavior_valid(int behavior)
     }
 }
 
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+    switch (behavior) {
+    case MADV_COLD:
+    case MADV_PAGEOUT:
+        return true;
+    default:
+        return false;
+    }
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -1031,6 +1048,11 @@ madvise_behavior_valid(int behavior)
  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
  *        from being included in its core dump.
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ *  MADV_COLD - the application is not expected to use this memory soon,
+ *        deactivate pages in this range so that they can be reclaimed
+ *        easily if memory pressure hanppens.
+ *  MADV_PAGEOUT - the application is not expected to use this memory soon,
+ *        page out the pages in this range immediately.
  *
  * return values:
  *  zero    - success
@@ -1045,7 +1067,7 @@ madvise_behavior_valid(int behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 {
     unsigned long end, tmp;
     struct vm_area_struct *vma, *prev;
@@ -1083,10 +1105,10 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 
     write = madvise_need_mmap_write(behavior);
     if (write) {
-        if (mmap_write_lock_killable(current->mm))
+        if (mmap_write_lock_killable(mm))
             return -EINTR;
     } else {
-        mmap_read_lock(current->mm);
+        mmap_read_lock(mm);
     }
 
     /*
@@ -1094,7 +1116,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
      * ranges, just ignore them, but return -ENOMEM at the end.
      * - different from the way of handling in mlock etc.
      */
-    vma = find_vma_prev(current->mm, start, &prev);
+    vma = find_vma_prev(mm, start, &prev);
     if (vma && start > vma->vm_start)
         prev = vma;
 
@@ -1131,19 +1153,92 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
         if (prev)
             vma = prev->vm_next;
         else    /* madvise_remove dropped mmap_lock */
-            vma = find_vma(current->mm, start);
+            vma = find_vma(mm, start);
     }
 out:
     blk_finish_plug(&plug);
     if (write)
-        mmap_write_unlock(current->mm);
+        mmap_write_unlock(mm);
     else
-        mmap_read_unlock(current->mm);
+        mmap_read_unlock(mm);
 
     return error;
 }
 
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-    return do_madvise(start, len_in, behavior);
+    return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+        size_t, vlen, int, behavior, unsigned int, flags)
+{
+    ssize_t ret;
+    struct iovec iovstack[UIO_FASTIOV], iovec;
+    struct iovec *iov = iovstack;
+    struct iov_iter iter;
+    struct pid *pid;
+    struct task_struct *task;
+    struct mm_struct *mm;
+    size_t total_len;
+    unsigned int f_flags;
+
+    if (flags != 0) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+    if (ret < 0)
+        goto out;
+
+    pid = pidfd_get_pid(pidfd, &f_flags);
+    if (IS_ERR(pid)) {
+        ret = PTR_ERR(pid);
+        goto free_iov;
+    }
+
+    task = get_pid_task(pid, PIDTYPE_PID);
+    if (!task) {
+        ret = -ESRCH;
+        goto put_pid;
+    }
+
+    if (task->mm != current->mm &&
+            !process_madvise_behavior_valid(behavior)) {
+        ret = -EINVAL;
+        goto release_task;
+    }
+
+    mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+    if (IS_ERR_OR_NULL(mm)) {
+        ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+        goto release_task;
+    }
+
+    total_len = iov_iter_count(&iter);
+
+    while (iov_iter_count(&iter)) {
+        iovec = iov_iter_iovec(&iter);
+        ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+                    iovec.iov_len, behavior);
+        if (ret < 0)
+            break;
+        iov_iter_advance(&iter, iovec.iov_len);
+    }
+
+    if (ret == 0)
+        ret = total_len - iov_iter_count(&iter);
+
+    mmput(mm);
+    return ret;
+
+release_task:
+    put_task_struct(task);
+put_pid:
+    put_pid(pid);
+free_iov:
+    kfree(iov);
+out:
+    return ret;
+}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f74a158cfa8..3a24e3b619f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
 
@@ -1061,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_page);
 
-/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+static __always_inline struct mem_cgroup *active_memcg(void)
 {
-    if (unlikely(current->active_memcg)) {
-        struct mem_cgroup *memcg;
+    if (in_interrupt())
+        return this_cpu_read(int_active_memcg);
+    else
+        return current->active_memcg;
+}
 
-        rcu_read_lock();
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+    struct mem_cgroup *memcg;
+
+    rcu_read_lock();
+    memcg = active_memcg();
+    if (memcg) {
         /* current->active_memcg must hold a ref. */
-        if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+        if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
             memcg = root_mem_cgroup;
         else
             memcg = current->active_memcg;
-        rcu_read_unlock();
-        return memcg;
     }
+    rcu_read_unlock();
+
+    return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+    /* Allow remote memcg charging from any context. */
+    if (unlikely(active_memcg()))
+        return false;
+
+    /* Memcg to charge can't be determined. */
+    if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+        return true;
+
+    return false;
+}
+
+/**
+ * If active memcg is set, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+    if (memcg_kmem_bypass())
+        return NULL;
+
+    if (unlikely(active_memcg()))
+        return get_active_memcg();
 
     return get_mem_cgroup_from_mm(current->mm);
 }
@@ -2933,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
     struct obj_cgroup *objcg = NULL;
     struct mem_cgroup *memcg;
 
-    if (unlikely(!current->mm && !current->active_memcg))
+    if (memcg_kmem_bypass())
         return NULL;
 
     rcu_read_lock();
-    if (unlikely(current->active_memcg))
-        memcg = rcu_dereference(current->active_memcg);
+    if (unlikely(active_memcg()))
+        memcg = active_memcg();
     else
         memcg = mem_cgroup_from_task(current);
 
@@ -3059,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
     struct mem_cgroup *memcg;
     int ret = 0;
 
-    if (memcg_kmem_bypass())
-        return 0;
-
     memcg = get_mem_cgroup_from_current();
-    if (!mem_cgroup_is_root(memcg)) {
+    if (memcg && !mem_cgroup_is_root(memcg)) {
         ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
         if (!ret) {
             page->mem_cgroup = memcg;
             __SetPageKmemcg(page);
             return 0;
         }
+        css_put(&memcg->css);
     }
-    css_put(&memcg->css);
     return ret;
 }
 
@@ -5290,12 +5323,12 @@ static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
     struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
-    struct mem_cgroup *memcg;
+    struct mem_cgroup *memcg, *old_memcg;
     long error = -ENOMEM;
 
-    memalloc_use_memcg(parent);
+    old_memcg = set_active_memcg(parent);
     memcg = mem_cgroup_alloc();
-    memalloc_unuse_memcg();
+    set_active_memcg(old_memcg);
     if (IS_ERR(memcg))
         return ERR_CAST(memcg);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a2184b721fbf..c0bb186bba62 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn)
 }
 EXPORT_SYMBOL(unpoison_memory);
 
-static struct page *new_page(struct page *p, unsigned long private)
-{
-    struct migration_target_control mtc = {
-        .nid = page_to_nid(p),
-        .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
-    };
-
-    return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
 /*
  * Safely get reference count of an arbitrary page.
  * Returns 0 for a free page, -EIO for a zero refcount page
@@ -1797,6 +1787,10 @@ static int __soft_offline_page(struct page *page)
     char const *msg_page[] = {"page", "hugepage"};
     bool huge = PageHuge(page);
     LIST_HEAD(pagelist);
+    struct migration_target_control mtc = {
+        .nid = NUMA_NO_NODE,
+        .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+    };
 
     /*
      * Check PageHWPoison again inside page lock because PageHWPoison
@@ -1833,8 +1827,8 @@ static int __soft_offline_page(struct page *page)
     }
 
     if (isolate_page(hpage, &pagelist)) {
-        ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
-                    MIGRATE_SYNC, MR_MEMORY_FAILURE);
+        ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+            (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
         if (!ret) {
             bool release = !huge;
 
diff --git a/mm/memory.c b/mm/memory.c
index 589afe45d0b3..c48f8df6e502 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2391,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
     arch_enter_lazy_mmu_mode();
 
-    do {
-        if (create || !pte_none(*pte)) {
-            err = fn(pte++, addr, data);
-            if (err)
-                break;
-        }
-    } while (addr += PAGE_SIZE, addr != end);
+    if (fn) {
+        do {
+            if (create || !pte_none(*pte)) {
+                err = fn(pte++, addr, data);
+                if (err)
+                    break;
+            }
+        } while (addr += PAGE_SIZE, addr != end);
+    }
     *mask |= PGTBL_PTE_MODIFIED;
 
     arch_leave_lazy_mmu_mode();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6f203574ca1d..b44d4c7ba73b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1290,27 +1290,6 @@ found:
     return 0;
 }
 
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
-    nodemask_t nmask = node_states[N_MEMORY];
-    struct migration_target_control mtc = {
-        .nid = page_to_nid(page),
-        .nmask = &nmask,
-        .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
-    };
-
-    /*
-     * try to allocate from a different node but reuse this node if there
-     * are no other online nodes to be used (e.g. we are offlining a part
-     * of the only existing node)
-     */
-    node_clear(mtc.nid, nmask);
-    if (nodes_empty(nmask))
-        node_set(mtc.nid, nmask);
-
-    return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -1370,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
             put_page(page);
     }
     if (!list_empty(&source)) {
-        /* Allocate a new page from the nearest neighbor node */
-        ret = migrate_pages(&source, new_node_page, NULL, 0,
-                    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+        nodemask_t nmask = node_states[N_MEMORY];
+        struct migration_target_control mtc = {
+            .nmask = &nmask,
+            .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+        };
+
+        /*
+         * We have checked that migration range is on a single zone so
+         * we can use the nid of the first page to all the others.
+         */
+        mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+        /*
+         * try to allocate from a different node but reuse this node
+         * if there are no other online nodes to be used (e.g. we are
+         * offlining a part of the only existing node)
+         */
+        node_clear(mtc.nid, nmask);
+        if (nodes_empty(nmask))
+            node_set(mtc.nid, nmask);
+        ret = migrate_pages(&source, alloc_migration_target, NULL,
+            (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
         if (ret) {
             list_for_each_entry(page, &source, lru) {
                 pr_warn("migrating pfn %lx failed ret:%d ",
diff --git a/mm/migrate.c b/mm/migrate.c
index 4cf1af88c1dd..5ca5842df5db 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1864,33 +1864,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
     return nr_pages ? -EFAULT : 0;
 }
 
-/*
- * Move a list of pages in the address space of the currently executing
- * process.
- */
-static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
-                 const void __user * __user *pages,
-                 const int __user *nodes,
-                 int __user *status, int flags)
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
 {
     struct task_struct *task;
     struct mm_struct *mm;
-    int err;
-    nodemask_t task_nodes;
-
-    /* Check flags */
-    if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
-        return -EINVAL;
 
-    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
-        return -EPERM;
+    /*
+     * There is no need to check if current process has the right to modify
+     * the specified process when they are same.
+     */
+    if (!pid) {
+        mmget(current->mm);
+        *mem_nodes = cpuset_mems_allowed(current);
+        return current->mm;
+    }
 
     /* Find the mm_struct */
     rcu_read_lock();
-    task = pid ? find_task_by_vpid(pid) : current;
+    task = find_task_by_vpid(pid);
     if (!task) {
         rcu_read_unlock();
-        return -ESRCH;
+        return ERR_PTR(-ESRCH);
     }
     get_task_struct(task);
 
@@ -1900,22 +1894,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
      */
     if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
         rcu_read_unlock();
-        err = -EPERM;
+        mm = ERR_PTR(-EPERM);
         goto out;
     }
     rcu_read_unlock();
 
-    err = security_task_movememory(task);
-    if (err)
+    mm = ERR_PTR(security_task_movememory(task));
+    if (IS_ERR(mm))
         goto out;
-
-    task_nodes = cpuset_mems_allowed(task);
+    *mem_nodes = cpuset_mems_allowed(task);
     mm = get_task_mm(task);
+out:
     put_task_struct(task);
-    if (!mm)
+    if (!mm)
+        mm = ERR_PTR(-EINVAL);
+    return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+                 const void __user * __user *pages,
+                 const int __user *nodes,
+                 int __user *status, int flags)
+{
+    struct mm_struct *mm;
+    int err;
+    nodemask_t task_nodes;
+
+    /* Check flags */
+    if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
         return -EINVAL;
+
+    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+        return -EPERM;
+
+    mm = find_mm_struct(pid, &task_nodes);
+    if (IS_ERR(mm))
+        return PTR_ERR(mm);
+
     if (nodes)
         err = do_pages_move(mm, task_nodes, nr_pages, pages,
                     nodes, status, flags);
@@ -1924,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
 
     mmput(mm);
     return err;
-
-out:
-    put_task_struct(task);
-    return err;
 }
 
 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
diff --git a/mm/mmap.c b/mm/mmap.c
index ebb92f5515a1..d91ecb00d38c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
     return 0;
 }
 
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */ +static inline struct vm_area_struct *vma_next(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + if (!vma) + return mm->mmap; + + return vma->vm_next; +} + +/* + * munmap_vma_range() - munmap VMAs that overlap a range. + * @mm: The mm struct + * @start: The start of the range. + * @len: The length of the range. + * @pprev: pointer to the pointer that will be set to previous vm_area_struct + * @rb_link: the rb_node + * @rb_parent: the parent rb_node + * + * Find all the vm_area_struct that overlap from @start to + * @end and munmap them. Set @pprev to the previous vm_area_struct. + * + * Returns: -ENOMEM on munmap failure or 0 on success. + */ +static inline int +munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, + struct vm_area_struct **pprev, struct rb_node ***link, + struct rb_node **parent, struct list_head *uf) +{ + + while (find_vma_links(mm, start, start + len, pprev, link, parent)) + if (do_munmap(mm, start, len, uf)) + return -ENOMEM; + + return 0; +} static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -1128,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - if (prev) - next = prev->vm_next; - else - next = mm->mmap; + next = vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; @@ -1707,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } - + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* * Private writable mapping: check memory availability */ @@ -2632,7 +2669,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; + struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); @@ -2831,7 +2868,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (error) return error; } - vma = prev ? prev->vm_next : mm->mmap; + vma = vma_next(mm, prev); if (unlikely(uf)) { /* @@ -3049,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* - * Clear old maps. this also does some error checking for us - */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* Check against address space limits *after* clearing old maps... 
*/ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) diff --git a/mm/nommu.c b/mm/nommu.c index 0df7ca321314..0faf39b32cdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -354,13 +354,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - BUG(); - return NULL; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { BUG(); diff --git a/mm/percpu.c b/mm/percpu.c index 1ed1a349eab8..66a93f096394 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) || - memcg_kmem_bypass()) + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) return PCPU_CHUNK_ROOT; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 06c6587765a3..6d7c6a5056ba 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -280,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (memcg_kmem_bypass()) - return NULL; - objcg = get_obj_cgroup_from_current(); if (!objcg) return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04ac98bf5045..6ae491a8b210 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/mm/vmalloc.c - * * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 @@ -2321,20 +2319,21 @@ static void __vfree(const void *addr) } /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - Release memory allocated by vmalloc() + * @addr: Memory base address * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Free the virtually continuous memory area starting at @addr, as obtained + * from one of the vmalloc() family of APIs. This will usually also free the + * physical memory underlying the virtual allocation, but that memory is + * reference counted, so it will not be freed until the last user goes away. * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * If @addr is NULL, no operation is performed. * + * Context: * May sleep if called *not* from interrupt context. - * - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * Must not be called in NMI context (strictly speaking, it could be + * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea). */ void vfree(const void *addr) { @@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap); * @flags: vm_area->flags * @prot: page protection for the mapping * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * Maps @count pages from @pages into contiguous kernel virtual space. + * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself + * (which must be kmalloc or vmalloc memory) and one reference per pages in it + * are transferred from the caller to vmap(), and will be freed / dropped when + * vfree() is called on the return value. 
  *
  * Return: the address of the area or %NULL on failure
  */
@@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count,
         return NULL;
     }
 
+    if (flags & VM_MAP_PUT_PAGES)
+        area->pages = pages;
     return area->addr;
 }
 EXPORT_SYMBOL(vmap);
 
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+    unsigned long *pfns;
+    pgprot_t prot;
+    unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+    struct vmap_pfn_data *data = private;
+
+    if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+        return -EINVAL;
+    *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+    return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+    struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+    struct vm_struct *area;
+
+    area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+            __builtin_return_address(0));
+    if (!area)
+        return NULL;
+    if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+            count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+        free_vm_area(area);
+        return NULL;
+    }
+    return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                  pgprot_t prot, int node)
 {
-    struct page **pages;
-    unsigned int nr_pages, array_size, i;
     const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-    const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
-    const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
-                    0 :
-                    __GFP_HIGHMEM;
+    unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+    unsigned int array_size = nr_pages * sizeof(struct page *), i;
+    struct page **pages;
 
-    nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
-    array_size = (nr_pages * sizeof(struct page *));
+    gfp_mask |= __GFP_NOWARN;
+    if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+        gfp_mask |= __GFP_HIGHMEM;
 
     /* Please note that the recursion is strictly bounded. */
     if (array_size > PAGE_SIZE) {
-        pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
-                node, area->caller);
+        pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+                    area->caller);
     } else {
         pages = kmalloc_node(array_size, nested_gfp, node);
     }
@@ -2442,9 +2489,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
         struct page *page;
 
         if (node == NUMA_NO_NODE)
-            page = alloc_page(alloc_mask|highmem_mask);
+            page = alloc_page(gfp_mask);
         else
-            page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+            page = alloc_pages_node(node, gfp_mask, 0);
 
         if (unlikely(!page)) {
             /* Successfully allocated i pages, free them in __vfree() */
@@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
-    pte_t ***p = data;
-
-    if (p) {
-        *(*p) = pte;
-        (*p)++;
-    }
-    return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size:    size of the area
- * @ptes:    returns the PTEs for the address space
- *
- * Returns:    NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range.  No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
-    struct vm_struct *area;
-
-    area = get_vm_area_caller(size, VM_IOREMAP,
-                __builtin_return_address(0));
-    if (area == NULL)
-        return NULL;
-
-    /*
-     * This ensures that page tables are constructed for this region
-     * of kernel virtual address space and mapped into init_mm.
-     */
-    if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
-                size, f, ptes ? &ptes : NULL)) {
-        free_vm_area(area);
-        return NULL;
-    }
-
-    return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
 void free_vm_area(struct vm_struct *area)
 {
     struct vm_struct *ret;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c36fdff9a371..918c7b019b3d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area)
      */
     if (area->vm)
         return 0;
-    area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+    area->vm = get_vm_area(PAGE_SIZE * 2, 0);
     if (!area->vm)
         return -ENOMEM;
-    return 0;
+
+    /*
+     * Populate ptes in advance to avoid pte allocation with GFP_KERNEL
+     * in non-preemtible context of zs_map_object.
+     */
+    return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
+            PAGE_SIZE * 2, NULL, NULL);
 }
 
 static inline void __zs_cpu_down(struct mapping_area *area)
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 0a28a6a29581..c9404ef9698e 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -45,7 +45,7 @@ struct hmm_buffer {
 #define TWOMEG        (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
 #define HMM_PATH_MAX    64
-#define NTIMES        256
+#define NTIMES        10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
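
The alloc_vm_area() removal above converts every caller to plain vmap() (or vmap_pfn()). A hedged sketch of the new idiom, with a hypothetical example_pin_pages() helper: allocate a page array, hand it to vmap() with the new VM_MAP_PUT_PAGES flag, and let a single vfree() drop both the page references and the array itself, as shmem_pin_map()/shmem_unpin_map() now do:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *example_pin_pages(unsigned int n_pages)
{
    struct page **pages;
    unsigned int i;
    void *vaddr;

    pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
    if (!pages)
        return NULL;

    for (i = 0; i < n_pages; i++) {
        pages[i] = alloc_page(GFP_KERNEL);
        if (!pages[i])
            goto err;
    }

    /* On success, vmap() now owns the array and one ref per page. */
    vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL);
    if (!vaddr)
        goto err;
    return vaddr;    /* later: just vfree(vaddr); */

err:
    while (i--)
        __free_page(pages[i]);
    kvfree(pages);
    return NULL;
}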
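
For memory that has no struct page backing, vmap() cannot be used; the new vmap_pfn() covers that case, as in the i915_gem_object_map_pfn() conversion above. A hedged sketch with a hypothetical example_map_io_pfns() helper; the base PFN is a placeholder and PAGE_KERNEL_IO follows the i915 code (an x86 protection):

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *example_map_io_pfns(unsigned long base_pfn, unsigned int count)
{
    unsigned long *pfns;
    unsigned int i;
    void *vaddr;

    pfns = kvmalloc_array(count, sizeof(*pfns), GFP_KERNEL);
    if (!pfns)
        return NULL;
    for (i = 0; i < count; i++)
        pfns[i] = base_pfn + i;

    /* vmap_pfn() rejects PFNs backed by struct page (pfn_valid()). */
    vaddr = vmap_pfn(pfns, count, pgprot_writecombine(PAGE_KERNEL_IO));
    kvfree(pfns);    /* the pfn array is not consumed */
    return vaddr;    /* unmap later with vunmap() */
}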
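
The "allow a NULL fn callback in apply_to_page_range" change lets callers pre-populate kernel page tables without installing any mappings. This sketch mirrors the zsmalloc conversion above, under the same assumptions (a two-page per-cpu mapping area):

#include <linux/mm.h>
#include <linux/vmalloc.h>

static int example_prealloc_ptes(struct vm_struct **vmp)
{
    struct vm_struct *vm;

    vm = get_vm_area(PAGE_SIZE * 2, 0);
    if (!vm)
        return -ENOMEM;
    *vmp = vm;

    /* fn == NULL: just allocate page tables covering the range. */
    return apply_to_page_range(&init_mm, (unsigned long)vm->addr,
                   PAGE_SIZE * 2, NULL, NULL);
}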
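
Finally, the memalloc_use_memcg()/memalloc_unuse_memcg() pair is replaced by the nestable set_active_memcg(), which returns the previous scope so callers can restore it. A hedged sketch of the save/restore pattern used by the fs/buffer.c, fanotify, and inotify conversions above:

#include <linux/memcontrol.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *example_charged_alloc(struct mem_cgroup *memcg, size_t size)
{
    struct mem_cgroup *old_memcg;
    void *buf;

    old_memcg = set_active_memcg(memcg);
    /* __GFP_ACCOUNT allocations in this scope are charged to @memcg. */
    buf = kmalloc(size, GFP_KERNEL_ACCOUNT);
    set_active_memcg(old_memcg);    /* restore the outer scope */

    return buf;
}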