summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-23 09:58:07 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-23 09:58:07 -0800
commit5c00ff742bf5caf85f60e1c73999f99376fb865d (patch)
treefa484e83c27af79f1c0511e7e0673507461c9379 /kernel
parent228a1157fb9fec47eb135b51c0202b574e079ebf (diff)
parent2532e6c74a67e65b95f310946e0c0e0a41b3a34b (diff)
Merge tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - The series "zram: optimal post-processing target selection" from Sergey Senozhatsky improves zram's post-processing selection algorithm. This leads to improved memory savings. - Wei Yang has gone to town on the mapletree code, contributing several series which clean up the implementation: - "refine mas_mab_cp()" - "Reduce the space to be cleared for maple_big_node" - "maple_tree: simplify mas_push_node()" - "Following cleanup after introduce mas_wr_store_type()" - "refine storing null" - The series "selftests/mm: hugetlb_fault_after_madv improvements" from David Hildenbrand fixes this selftest for s390. - The series "introduce pte_offset_map_{ro|rw}_nolock()" from Qi Zheng implements some rationaizations and cleanups in the page mapping code. - The series "mm: optimize shadow entries removal" from Shakeel Butt optimizes the file truncation code by speeding up the handling of shadow entries. - The series "Remove PageKsm()" from Matthew Wilcox completes the migration of this flag over to being a folio-based flag. - The series "Unify hugetlb into arch_get_unmapped_area functions" from Oscar Salvador implements a bunch of consolidations and cleanups in the hugetlb code. - The series "Do not shatter hugezeropage on wp-fault" from Dev Jain takes away the wp-fault time practice of turning a huge zero page into small pages. Instead we replace the whole thing with a THP. More consistent cleaner and potentiall saves a large number of pagefaults. - The series "percpu: Add a test case and fix for clang" from Andy Shevchenko enhances and fixes the kernel's built in percpu test code. - The series "mm/mremap: Remove extra vma tree walk" from Liam Howlett optimizes mremap() by avoiding doing things which we didn't need to do. - The series "Improve the tmpfs large folio read performance" from Baolin Wang teaches tmpfs to copy data into userspace at the folio size rather than as individual pages. A 20% speedup was observed. - The series "mm/damon/vaddr: Fix issue in damon_va_evenly_split_region()" fro Zheng Yejian fixes DAMON splitting. - The series "memcg-v1: fully deprecate charge moving" from Shakeel Butt removes the long-deprecated memcgv2 charge moving feature. - The series "fix error handling in mmap_region() and refactor" from Lorenzo Stoakes cleanup up some of the mmap() error handling and addresses some potential performance issues. - The series "x86/module: use large ROX pages for text allocations" from Mike Rapoport teaches x86 to use large pages for read-only-execute module text. - The series "page allocation tag compression" from Suren Baghdasaryan is followon maintenance work for the new page allocation profiling feature. - The series "page->index removals in mm" from Matthew Wilcox remove most references to page->index in mm/. A slow march towards shrinking struct page. - The series "damon/{self,kunit}tests: minor fixups for DAMON debugfs interface tests" from Andrew Paniakin performs maintenance work for DAMON's self testing code. - The series "mm: zswap swap-out of large folios" from Kanchana Sridhar improves zswap's batching of compression and decompression. It is a step along the way towards using Intel IAA hardware acceleration for this zswap operation. - The series "kasan: migrate the last module test to kunit" from Sabyrzhan Tasbolatov completes the migration of the KASAN built-in tests over to the KUnit framework. - The series "implement lightweight guard pages" from Lorenzo Stoakes permits userapace to place fault-generating guard pages within a single VMA, rather than requiring that multiple VMAs be created for this. Improved efficiencies for userspace memory allocators are expected. - The series "memcg: tracepoint for flushing stats" from JP Kobryn uses tracepoints to provide increased visibility into memcg stats flushing activity. - The series "zram: IDLE flag handling fixes" from Sergey Senozhatsky fixes a zram buglet which potentially affected performance. - The series "mm: add more kernel parameters to control mTHP" from MaĆ­ra Canal enhances our ability to control/configuremultisize THP from the kernel boot command line. - The series "kasan: few improvements on kunit tests" from Sabyrzhan Tasbolatov has a couple of fixups for the KASAN KUnit tests. - The series "mm/list_lru: Split list_lru lock into per-cgroup scope" from Kairui Song optimizes list_lru memory utilization when lockdep is enabled. * tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (215 commits) cma: enforce non-zero pageblock_order during cma_init_reserved_mem() mm/kfence: add a new kunit test test_use_after_free_read_nofault() zram: fix NULL pointer in comp_algorithm_show() memcg/hugetlb: add hugeTLB counters to memcg vmstat: call fold_vm_zone_numa_events() before show per zone NUMA event mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount zram: ZRAM_DEF_COMP should depend on ZRAM MAINTAINERS/MEMORY MANAGEMENT: add document files for mm Docs/mm/damon: recommend academic papers to read and/or cite mm: define general function pXd_init() kmemleak: iommu/iova: fix transient kmemleak false positive mm/list_lru: simplify the list_lru walk callback function mm/list_lru: split the lock to per-cgroup scope mm/list_lru: simplify reparenting and initial allocation mm/list_lru: code clean up for reparenting mm/list_lru: don't export list_lru_add mm/list_lru: don't pass unnecessary key parameters kasan: add kunit tests for kmalloc_track_caller, kmalloc_node_track_caller kasan: change kasan_atomics kunit test as KUNIT_CASE_SLOW kasan: use EXPORT_SYMBOL_IF_KUNIT to export symbols ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/events/uprobes.c1
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/module/debug_kmemleak.c3
-rw-r--r--kernel/module/main.c148
-rw-r--r--kernel/module/strict_rwx.c3
-rw-r--r--kernel/resource.c4
7 files changed, 131 insertions, 36 deletions
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a76ddc5fc982..fa04b14a7d72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index e58d27c05788..f253e81d0c28 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,7 +16,6 @@
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
@@ -1546,8 +1545,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return ERR_PTR(err);
mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ if (!mm) {
+ mm = ERR_PTR(-ESRCH);
+ } else if (mm != current->mm && !ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 326bfe6549d7..6de57246760e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -399,7 +399,7 @@ again:
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = folio->index + folio_page_idx(folio, page);
+ key->shared.pgoff = page_pgoff(folio, page);
rcu_read_unlock();
}
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
index b4cc03842d70..df873dad049d 100644
--- a/kernel/module/debug_kmemleak.c
+++ b/kernel/module/debug_kmemleak.c
@@ -14,7 +14,8 @@ void kmemleak_load_module(const struct module *mod,
{
/* only scan writable, non-executable sections */
for_each_mod_mem_type(type) {
- if (type != MOD_DATA && type != MOD_INIT_DATA)
+ if (type != MOD_DATA && type != MOD_INIT_DATA &&
+ !mod->mem[type].is_rox)
kmemleak_no_scan(mod->mem[type].base);
}
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4490924fe24e..d2e1b8976c7b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1189,6 +1189,18 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+void *__module_writable_address(struct module *mod, void *loc)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (loc >= mem->base && loc < mem->base + mem->size)
+ return loc + (mem->rw_copy - mem->base);
+ }
+
+ return loc;
+}
+
static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1206,6 +1218,23 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
if (!ptr)
return -ENOMEM;
+ mod->mem[type].base = ptr;
+
+ if (execmem_is_rox(execmem_type)) {
+ ptr = vzalloc(size);
+
+ if (!ptr) {
+ execmem_free(mod->mem[type].base);
+ return -ENOMEM;
+ }
+
+ mod->mem[type].rw_copy = ptr;
+ mod->mem[type].is_rox = true;
+ } else {
+ mod->mem[type].rw_copy = mod->mem[type].base;
+ memset(mod->mem[type].base, 0, size);
+ }
+
/*
* The pointer to these blocks of memory are stored on the module
* structure and we keep that around so long as the module is
@@ -1219,24 +1248,20 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
*/
kmemleak_not_leak(ptr);
- memset(ptr, 0, size);
- mod->mem[type].base = ptr;
-
return 0;
}
-static void module_memory_free(struct module *mod, enum mod_mem_type type,
- bool unload_codetags)
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
{
- void *ptr = mod->mem[type].base;
+ struct module_memory *mem = &mod->mem[type];
- if (!unload_codetags && mod_mem_type_is_core_data(type))
- return;
+ if (mem->is_rox)
+ vfree(mem->rw_copy);
- execmem_free(ptr);
+ execmem_free(mem->base);
}
-static void free_mod_mem(struct module *mod, bool unload_codetags)
+static void free_mod_mem(struct module *mod)
{
for_each_mod_mem_type(type) {
struct module_memory *mod_mem = &mod->mem[type];
@@ -1247,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
- module_memory_free(mod, type, unload_codetags);
+ module_memory_free(mod, type);
}
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
- module_memory_free(mod, MOD_DATA, unload_codetags);
+ module_memory_free(mod, MOD_DATA);
}
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
- bool unload_codetags;
-
trace_module_free(mod);
- unload_codetags = codetag_unload_module(mod);
- if (!unload_codetags)
- pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
- mod->name);
+ codetag_unload_module(mod);
mod_sysfs_teardown(mod);
@@ -1308,7 +1328,7 @@ static void free_module(struct module *mod)
kfree(mod->args);
percpu_modfree(mod);
- free_mod_mem(mod, unload_codetags);
+ free_mod_mem(mod);
}
void *__symbol_get(const char *symbol)
@@ -1573,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i
if (WARN_ON_ONCE(type == MOD_INVALID))
continue;
+ /*
+ * Do not allocate codetag memory as we load it into
+ * preallocated contiguous memory.
+ */
+ if (codetag_needs_module_section(mod, sname, s->sh_size)) {
+ /*
+ * s->sh_entsize won't be used but populate the
+ * type field to avoid confusion.
+ */
+ s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK)
+ << SH_ENTSIZE_TYPE_SHIFT;
+ continue;
+ }
+
s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
pr_debug("\t%s\n", sname);
}
@@ -2247,17 +2281,19 @@ static int move_module(struct module *mod, struct load_info *info)
int i;
enum mod_mem_type t = 0;
int ret = -ENOMEM;
+ bool codetag_section_found = false;
for_each_mod_mem_type(type) {
if (!mod->mem[type].size) {
mod->mem[type].base = NULL;
+ mod->mem[type].rw_copy = NULL;
continue;
}
ret = module_memory_alloc(mod, type);
if (ret) {
t = type;
- goto out_enomem;
+ goto out_err;
}
}
@@ -2266,12 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
- enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ const char *sname;
+ unsigned long addr;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
+ sname = info->secstrings + shdr->sh_name;
+ /*
+ * Load codetag sections separately as they might still be used
+ * after module unload.
+ */
+ if (codetag_needs_module_section(mod, sname, shdr->sh_size)) {
+ dest = codetag_alloc_module_section(mod, sname, shdr->sh_size,
+ arch_mod_section_prepend(mod, i), shdr->sh_addralign);
+ if (WARN_ON(!dest)) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ if (IS_ERR(dest)) {
+ ret = PTR_ERR(dest);
+ goto out_err;
+ }
+ addr = (unsigned long)dest;
+ codetag_section_found = true;
+ } else {
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+
+ addr = (unsigned long)mod->mem[type].base + offset;
+ dest = mod->mem[type].rw_copy + offset;
+ }
if (shdr->sh_type != SHT_NOBITS) {
/*
@@ -2283,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info)
if (i == info->index.mod &&
(WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
ret = -ENOEXEC;
- goto out_enomem;
+ goto out_err;
}
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
}
@@ -2293,15 +2354,18 @@ static int move_module(struct module *mod, struct load_info *info)
* users of info can keep taking advantage and using the newly
* minted official memory area.
*/
- shdr->sh_addr = (unsigned long)dest;
+ shdr->sh_addr = addr;
pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
(long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
-out_enomem:
+out_err:
for (t--; t >= 0; t--)
- module_memory_free(mod, t, true);
+ module_memory_free(mod, t);
+ if (codetag_section_found)
+ codetag_free_module_sections(mod);
+
return ret;
}
@@ -2422,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
+ codetag_module_replaced(info->mod, mod);
+
return mod;
}
@@ -2431,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
percpu_modfree(mod);
module_arch_freeing_init(mod);
- free_mod_mem(mod, true);
+ free_mod_mem(mod);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2441,8 +2507,17 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
+int __weak module_post_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
static int post_relocation(struct module *mod, const struct load_info *info)
{
+ int ret;
+
/* Sort exception table now relocations are done. */
sort_extable(mod->extable, mod->extable + mod->num_exentries);
@@ -2454,7 +2529,24 @@ static int post_relocation(struct module *mod, const struct load_info *info)
add_kallsyms(mod, info);
/* Arch-specific module finalizing. */
- return module_finalize(info->hdr, info->sechdrs, mod);
+ ret = module_finalize(info->hdr, info->sechdrs, mod);
+ if (ret)
+ return ret;
+
+ for_each_mod_mem_type(type) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox) {
+ if (!execmem_update_copy(mem->base, mem->rw_copy,
+ mem->size))
+ return -ENOMEM;
+
+ vfree(mem->rw_copy);
+ mem->rw_copy = NULL;
+ }
+ }
+
+ return module_post_finalize(info->hdr, info->sechdrs, mod);
}
/* Call module constructors. */
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index c45caa4690e5..239e5013359d 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -34,6 +34,9 @@ int module_enable_text_rox(const struct module *mod)
for_class_mod_mem_type(type, text) {
int ret;
+ if (mod->mem[type].is_rox)
+ continue;
+
if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
ret = module_set_memory(mod, type, set_memory_rox);
else
diff --git a/kernel/resource.c b/kernel/resource.c
index 4101016e8b20..d2c8143ae4ff 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1869,7 +1869,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end, PHYSMEM_END);
+ end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
return end - size + 1;
}
@@ -1886,7 +1886,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
+ addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,