From 979857ea2deae05454d257f119bedfe84a2c74d9 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Sep 2022 10:47:42 +0200 Subject: mm: slub: remove dead and buggy code from sysfs_slab_add() The function sysfs_slab_add() has two callers: One is slab_sysfs_init(), which first initializes slab_kset, and only when that succeeds sets slab_state to FULL, and then proceeds to call sysfs_slab_add() for all previously created slabs. The other is __kmem_cache_create(), but only after a if (slab_state <= UP) return 0; check. So in other words, sysfs_slab_add() is never called without slab_kset (aka the return value of cache_kset()) being non-NULL. And this is just as well, because if we ever did take this path and called kobject_init(&s->kobj), and then later when called again from slab_sysfs_init() would end up calling kobject_init_and_add(), we would hit if (kobj->state_initialized) { /* do not error out as sometimes we can recover */ pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n", dump_stack(); } in kobject.c. Signed-off-by: Rasmus Villemoes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..76f4d3c41511 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5920,11 +5920,6 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); - if (!kset) { - kobject_init(&s->kobj, &slab_ktype); - return 0; - } - if (!unmergeable && disable_higher_order_debug && (slub_debug & DEBUG_METADATA_FLAGS)) unmergeable = 1; -- cgit v1.2.3-70-g09d2 From 1a5ad30b89b4e9fa64f75b941a324396738b7616 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 30 Sep 2022 12:27:12 +0200 Subject: mm: slub: make slab_sysfs_init() a late_initcall Currently, slab_sysfs_init() is an __initcall aka device_initcall. It is rather time-consuming; on my board it takes around 11ms. That's about 1% of the time budget I have from U-Boot letting go and until linux must assume responsibility of keeping the external watchdog happy. There's no particular reason this would need to run at device_initcall time, so instead make it a late_initcall to allow vital functionality to get started a bit sooner. This actually ends up winning more than just those 11ms, because the slab caches that get created during other device_initcalls (and before my watchdog device gets probed) now don't end up doing the somewhat expensive sysfs_slab_add() themselves. Some example lines (with initcall_debug set) before/after: initcall ext4_init_fs+0x0/0x1ac returned 0 after 1386 usecs initcall journal_init+0x0/0x138 returned 0 after 517 usecs initcall init_fat_fs+0x0/0x68 returned 0 after 294 usecs initcall ext4_init_fs+0x0/0x1ac returned 0 after 240 usecs initcall journal_init+0x0/0x138 returned 0 after 32 usecs initcall init_fat_fs+0x0/0x68 returned 0 after 18 usecs Altogether, this means I now get to petting the watchdog around 17ms sooner. [Of course, the time the other initcalls save is instead spent in slab_sysfs_init(), which goes from 11ms to 16ms, so there's no overall change in boot time.] Signed-off-by: Rasmus Villemoes Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 76f4d3c41511..f162683f3006 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6049,8 +6049,7 @@ static int __init slab_sysfs_init(void) mutex_unlock(&slab_mutex); return 0; } - -__initcall(slab_sysfs_init); +late_initcall(slab_sysfs_init); #endif /* CONFIG_SYSFS */ #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) -- cgit v1.2.3-70-g09d2 From 7c82b3b308f9ca24852e3b0ee963b9eae128b78a Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Tue, 20 Sep 2022 15:11:11 +0300 Subject: mm: Make failslab writable again In (060807f841ac mm, slub: make remaining slub_debug related attributes read-only) failslab was made read-only. I think it became a collateral victim to the two other options for which the reasons are perfectly valid. Here is why: - sanity_checks and trace are slab internal debug options, failslab is used for fault injection. - for fault injections, which by presumption are random, it does not matter if it is not set atomically. And you need to set atleast one more option to trigger fault injection. - in a testing scenario you may need to change it at runtime example: module loading - you test all allocations limited by the space option. Then you move to test only your module's own slabs. - when set by command line flags it effectively disables all cache merges. Cc: Vlastimil Babka Cc: Andrew Morton Cc: Kees Cook Cc: Roman Gushchin Cc: Christoph Lameter Cc: Jann Horn Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-5-vbabka@suse.cz Signed-off-by: Alexander Atanasov Signed-off-by: Vlastimil Babka --- Documentation/mm/slub.rst | 2 ++ mm/slub.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 4e1578186b4f..7f652216dabe 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -116,6 +116,8 @@ options from the ``slub_debug`` parameter translate to the following files:: T trace A failslab +failslab file is writable, so writing 1 or 0 will enable or disable +the option at runtime. Write returns -EINVAL if cache is an alias. Careful with tracing: It may spew out lots of information and never stop if used on the wrong slab. diff --git a/mm/slub.c b/mm/slub.c index f162683f3006..3fc7d861d1f2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5586,7 +5586,21 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } -SLAB_ATTR_RO(failslab); + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + if (buf[0] == '1') + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + + return length; +} +SLAB_ATTR(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) -- cgit v1.2.3-70-g09d2 From a8e53869995b90609a798f9830d44086ab6025c4 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Fri, 14 Oct 2022 20:43:22 +0900 Subject: mm/slub: remove dead code for debug caches on deactivate_slab() After commit c7323a5ad0786 ("mm/slub: restrict sysfs validation to debug caches and make it safe"), SLUB never installs percpu slab for debug caches and thus never deactivates percpu slab for them. Since only debug caches use the full list, SLUB no longer deactivates to full list. Remove dead code in deactivate_slab(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..5eea9e446672 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2411,7 +2411,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; + enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; enum slab_modes mode = M_NONE; @@ -2487,14 +2487,6 @@ redo: * acquire_slab() will see a slab that is frozen */ spin_lock_irqsave(&n->list_lock, flags); - } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { - mode = M_FULL; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); } else { mode = M_FULL_NOLIST; } @@ -2504,7 +2496,7 @@ redo: old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) { - if (mode == M_PARTIAL || mode == M_FULL) + if (mode == M_PARTIAL) spin_unlock_irqrestore(&n->list_lock, flags); goto redo; } @@ -2518,10 +2510,6 @@ redo: stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL) { - add_full(s, n, slab); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, DEACTIVATE_FULL); } else if (mode == M_FULL_NOLIST) { stat(s, DEACTIVATE_FULL); } -- cgit v1.2.3-70-g09d2 From b539ce9f1a31c442098c3f351cb4d03ba27c2720 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 21 Oct 2022 21:18:12 +0200 Subject: mm/slab: Annotate kmem_cache_node->list_lock as raw The list_lock can be taken in hardirq context when do_drain() is being called via IPI on all cores, and therefore lockdep complains about it, because it can't be preempted on PREEMPT_RT. That's not a real issue, as SLAB can't be built on PREEMPT_RT anyway, but we still want to get rid of the warning on non-PREEMPT_RT builds. Annotate it therefore as a raw lock in order to get rid of he lockdep warning below. ============================= [ BUG: Invalid wait context ] 6.1.0-rc1-00134-ge35184f32151 #4 Not tainted ----------------------------- swapper/3/0 is trying to lock: ffff8bc88086dc18 (&parent->list_lock){..-.}-{3:3}, at: do_drain+0x57/0xb0 other info that might help us debug this: context-{2:2} no locks held by swapper/3/0. stack backtrace: CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.1.0-rc1-00134-ge35184f32151 #4 Hardware name: LENOVO 20K5S22R00/20K5S22R00, BIOS R0IET38W (1.16 ) 05/31/2017 Call Trace: dump_stack_lvl+0x6b/0x9d __lock_acquire+0x1519/0x1730 ? build_sched_domains+0x4bd/0x1590 ? __lock_acquire+0xad2/0x1730 lock_acquire+0x294/0x340 ? do_drain+0x57/0xb0 ? sched_clock_tick+0x41/0x60 _raw_spin_lock+0x2c/0x40 ? do_drain+0x57/0xb0 do_drain+0x57/0xb0 __flush_smp_call_function_queue+0x138/0x220 __sysvec_call_function+0x4f/0x210 sysvec_call_function+0x4b/0x90 asm_sysvec_call_function+0x16/0x20 RIP: 0010:mwait_idle+0x5e/0x80 Code: 31 d2 65 48 8b 04 25 80 ed 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 14 66 90 0f 00 2d 0b 78 46 00 31 c0 48 89 c1 fb 0f 01 c9 06 fb 0f 1f 44 00 00 65 48 8b 04 25 80 ed 01 00 f0 80 60 02 df RSP: 0000:ffffa90940217ee0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9bb9f93a RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000001 R10: ffffa90940217ea8 R11: 0000000000000000 R12: ffffffffffffffff R13: 0000000000000000 R14: ffff8bc88127c500 R15: 0000000000000000 ? default_idle_call+0x1a/0xa0 default_idle_call+0x4b/0xa0 do_idle+0x1f1/0x2c0 ? _raw_spin_unlock_irqrestore+0x56/0x70 cpu_startup_entry+0x19/0x20 start_secondary+0x122/0x150 secondary_startup_64_no_verify+0xce/0xdb Signed-off-by: Jiri Kosina Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.c | 90 +++++++++++++++++++++++++++++++-------------------------------- mm/slab.h | 4 +-- 2 files changed, 47 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..d8a287900193 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; - spin_lock_init(&parent->list_lock); + raw_spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; } @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, slab_node = slab_nid(slab); n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } @@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. * That way we could avoid the overhead of putting the objects @@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); } } @@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, slabs_destroy(cachep, &list); } else { n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } return 1; @@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) */ n = get_node(cachep, node); if (n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); return 0; } @@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, goto fail; n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); if (n->shared && force_change) { free_block(cachep, n->shared->entry, n->shared->avail, node, &list); @@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, new_alien = NULL; } - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); /* @@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu) if (!n) continue; - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; @@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu) nc->avail = 0; if (!cpumask_empty(mask)) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto free_slab; } @@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu) alien = n->alien; n->alien = NULL; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * /* * Do not assume that spinlocks can be initialized via memcpy: */ - spin_lock_init(&ptr->list_lock); + raw_spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->node[nodeid] = ptr; @@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long total_slabs, free_slabs, free_objs; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); total_slabs = n->total_slabs; free_slabs = n->free_slabs; free_objs = n->free_objects; - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", node, total_slabs - free_slabs, total_slabs, @@ -2096,7 +2096,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2104,7 +2104,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, node)->list_lock); + assert_raw_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2144,9 +2144,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail = 0; slabs_destroy(cachep, &list); } @@ -2164,9 +2164,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_alien_cache(cachep, n->alien); for_each_kmem_cache_node(cachep, node, n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, n->shared, node, true, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -2188,10 +2188,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); p = n->slabs_free.prev; if (p == &n->slabs_free) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto out; } @@ -2204,7 +2204,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. */ n->free_objects -= cache->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slab_destroy(cache, slab); nr_freed++; } @@ -2629,7 +2629,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) INIT_LIST_HEAD(&slab->slab_list); n = get_node(cachep, slab_nid(slab)); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); n->total_slabs++; if (!slab->active) { list_add_tail(&slab->slab_list, &n->slabs_free); @@ -2639,7 +2639,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) STATS_INC_GROWN(cachep); n->free_objects += cachep->num - slab->active; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); } @@ -2805,7 +2805,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct slab *slab; - assert_spin_locked(&n->list_lock); + assert_raw_spin_locked(&n->list_lock); slab = list_first_entry_or_null(&n->slabs_partial, struct slab, slab_list); if (!slab) { @@ -2832,10 +2832,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, if (!gfp_pfmemalloc_allowed(flags)) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, true); if (!slab) { - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return NULL; } @@ -2844,7 +2844,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; @@ -2903,7 +2903,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) if (!n->free_objects && (!shared || !shared->avail)) goto direct_grow; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ @@ -2927,7 +2927,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) must_grow: n->free_objects -= ac->avail; alloc_done: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); direct_grow: @@ -3147,7 +3147,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, BUG_ON(!n); check_irq_off(); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, false); if (!slab) goto must_grow; @@ -3165,12 +3165,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; must_grow: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); if (slab) { /* This slab isn't counted yet so don't update free_objects */ @@ -3325,7 +3325,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) check_irq_off(); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; @@ -3354,7 +3354,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); slabs_destroy(cachep, &list); @@ -3721,9 +3721,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(cpu); n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } free_percpu(prev); @@ -3815,9 +3815,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, return; } - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, ac, node, false, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -3901,7 +3901,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); total_slabs += n->total_slabs; free_slabs += n->free_slabs; @@ -3910,7 +3910,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) if (n->shared) shared_avail += n->shared->avail; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); } num_objs = total_slabs * cachep->num; active_slabs = total_slabs - free_slabs; diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..19e1baac807c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -750,9 +750,8 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { - spinlock_t list_lock; - #ifdef CONFIG_SLAB + raw_spinlock_t list_lock; struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; @@ -768,6 +767,7 @@ struct kmem_cache_node { #endif #ifdef CONFIG_SLUB + spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG -- cgit v1.2.3-70-g09d2 From bc29d5bd2ba977716e57572030290d6547ff3f6d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Aug 2022 11:09:11 +0200 Subject: mm/slub: perform free consistency checks before call_rcu For SLAB_TYPESAFE_BY_RCU caches we use call_rcu to perform empty slab freeing. The rcu callback rcu_free_slab() calls __free_slab() that currently includes checking the slab consistency for caches with SLAB_CONSISTENCY_CHECKS flags. This check needs the slab->objects field to be intact. Because in the next patch we want to allow rcu_head in struct slab to become larger in debug configurations and thus potentially overwrite more fields through a union than slab_list, we want to limit the fields used in rcu_free_slab(). Thus move the consistency checks to free_slab() before call_rcu(). This can be done safely even for SLAB_TYPESAFE_BY_RCU caches where accesses to the objects can still occur after freeing them. As a result, only the slab->slab_cache field has to be physically separate from rcu_head for the freeing callback to work. We also save some cycles in the rcu callback for caches with consistency checks enabled. Signed-off-by: Vlastimil Babka Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slub.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..99ba865afc4a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1999,14 +1999,6 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) int order = folio_order(folio); int pages = 1 << order; - if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { - void *p; - - slab_pad_check(s, slab); - for_each_object(p, s, slab_address(slab), slab->objects) - check_object(s, slab, p, SLUB_RED_INACTIVE); - } - __slab_clear_pfmemalloc(slab); __folio_clear_slab(folio); folio->mapping = NULL; @@ -2025,9 +2017,17 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct slab *slab) { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); - } else + else __free_slab(s, slab); } -- cgit v1.2.3-70-g09d2 From 9ce67395f5a0cdec6ce152d26bfda13b98b25c01 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 21 Oct 2022 11:24:03 +0800 Subject: mm/slub: only zero requested size of buffer for kzalloc when debug enabled kzalloc/kmalloc will round up the request size to a fixed size (mostly power of 2), so the allocated memory could be more than requested. Currently kzalloc family APIs will zero all the allocated memory. To detect out-of-bound usage of the extra allocated memory, only zero the requested part, so that redzone sanity check could be added to the extra space later. For kzalloc users who will call ksize() later and utilize this extra space, please be aware that the space is not zeroed any more when debug is enabled. (Thanks to Kees Cook's effort to sanitize all ksize() user cases [1], this won't be a big issue). [1]. https://lore.kernel.org/all/20220922031013.2150682-1-keescook@chromium.org/#r Signed-off-by: Feng Tang Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Andrey Konovalov Signed-off-by: Vlastimil Babka --- mm/slab.c | 7 ++++--- mm/slab.h | 18 ++++++++++++++++-- mm/slub.c | 10 +++++++--- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..26f41d47f5f1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3254,7 +3254,8 @@ slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, + cachep->object_size); return objp; } @@ -3507,13 +3508,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * Done outside of the IRQ disabled section. */ slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); + slab_want_init_on_alloc(flags, s), s->object_size); /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..2642102f6699 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -720,12 +720,26 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, static inline void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init) + size_t size, void **p, bool init, + unsigned int orig_size) { + unsigned int zero_size = s->object_size; size_t i; flags &= gfp_allowed_mask; + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + /* * As memory initialization might be integrated into KASAN, * kasan_slab_alloc and initialization memset must be @@ -736,7 +750,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, for (i = 0; i < size; i++) { p[i] = kasan_slab_alloc(s, p[i], flags, init); if (p[i] && init && !kasan_has_integrated_init()) - memset(p[i], 0, s->object_size); + memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); kmsan_slab_alloc(s, p[i], flags); diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..ecc44067625c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3395,7 +3395,11 @@ redo: init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + */ + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); return object; } @@ -3852,11 +3856,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * Done outside of the IRQ disabled fastpath loop. */ slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); + slab_want_init_on_alloc(flags, s), s->object_size); return i; error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } -- cgit v1.2.3-70-g09d2 From 5d1ba31087627423dfb2bd87badd62361701997b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 21 Oct 2022 11:24:04 +0800 Subject: mm: kasan: Extend kasan_metadata_size() to also cover in-object size When kasan is enabled for slab/slub, it may save kasan' free_meta data in the former part of slab object data area in slab object's free path, which works fine. There is ongoing effort to extend slub's debug function which will redzone the latter part of kmalloc object area, and when both of the debug are enabled, there is possible conflict, especially when the kmalloc object has small size, as caught by 0Day bot [1]. To solve it, slub code needs to know the in-object kasan's meta data size. Currently, there is existing kasan_metadata_size() which returns the kasan's metadata size inside slub's metadata area, so extend it to also cover the in-object meta size by adding a boolean flag 'in_object'. There is no functional change to existing code logic. [1]. https://lore.kernel.org/lkml/YuYm3dWwpZwH58Hu@xsang-OptiPlex-9020/ Reported-by: kernel test robot Suggested-by: Andrey Konovalov Signed-off-by: Feng Tang Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 5 +++-- mm/kasan/generic.c | 19 +++++++++++++------ mm/slub.c | 4 ++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d811b3d7d2a1..96c9d56e5510 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -302,7 +302,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC -size_t kasan_metadata_size(struct kmem_cache *cache); +size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); @@ -315,7 +315,8 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ /* Tag-based KASAN modes do not use per-object metadata. */ -static inline size_t kasan_metadata_size(struct kmem_cache *cache) +static inline size_t kasan_metadata_size(struct kmem_cache *cache, + bool in_object) { return 0; } diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d8b5590f9484..b076f597a378 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -450,15 +450,22 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) __memset(alloc_meta, 0, sizeof(*alloc_meta)); } -size_t kasan_metadata_size(struct kmem_cache *cache) +size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { + struct kasan_cache *info = &cache->kasan_info; + if (!kasan_requires_meta()) return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - ((cache->kasan_info.free_meta_offset && - cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? - sizeof(struct kasan_free_meta) : 0); + + if (in_object) + return (info->free_meta_offset ? + 0 : sizeof(struct kasan_free_meta)); + else + return (info->alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((info->free_meta_offset && + info->free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); } static void __kasan_record_aux_stack(void *addr, bool can_alloc) diff --git a/mm/slub.c b/mm/slub.c index ecc44067625c..b81a4bba1b73 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -910,7 +910,7 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (slub_debug_orig_size(s)) off += sizeof(unsigned int); - off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false); if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ @@ -1070,7 +1070,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) off += sizeof(unsigned int); } - off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false); if (size_from_object(s) == off) return 1; -- cgit v1.2.3-70-g09d2 From 946fa0dbf2d8923a587f7348adf16563d59f1b3d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 21 Oct 2022 11:24:05 +0800 Subject: mm/slub: extend redzone check to extra allocated kmalloc space than requested kmalloc will round up the request size to a fixed size (mostly power of 2), so there could be a extra space than what is requested, whose size is the actual buffer size minus original request size. To better detect out of bound access or abuse of this space, add redzone sanity check for it. In current kernel, some kmalloc user already knows the existence of the space and utilizes it after calling 'ksize()' to know the real size of the allocated buffer. So we skip the sanity check for objects which have been called with ksize(), as treating them as legitimate users. Kees Cook is working on sanitizing all these user cases, by using kmalloc_size_roundup() to avoid ambiguous usages. And after this is done, this special handling for ksize() can be removed. In some cases, the free pointer could be saved inside the latter part of object data area, which may overlap the redzone part(for small sizes of kmalloc objects). As suggested by Hyeonggon Yoo, force the free pointer to be in meta data area when kmalloc redzone debug is enabled, to make all kmalloc objects covered by redzone check. Suggested-by: Vlastimil Babka Signed-off-by: Feng Tang Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 ++++ mm/slab_common.c | 4 ++++ mm/slub.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 2642102f6699..190f2d4ec216 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -885,4 +885,8 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif +#ifdef CONFIG_SLUB_DEBUG +void skip_orig_size_check(struct kmem_cache *s, const void *object); +#endif + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 0042fb2730d1..8276022f0da4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1037,6 +1037,10 @@ size_t __ksize(const void *object) return folio_size(folio); } +#ifdef CONFIG_SLUB_DEBUG + skip_orig_size_check(folio_slab(folio)->slab_cache, object); +#endif + return slab_ksize(folio_slab(folio)->slab_cache); } diff --git a/mm/slub.c b/mm/slub.c index b81a4bba1b73..5f3e34923065 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -829,6 +829,17 @@ static inline void set_orig_size(struct kmem_cache *s, if (!slub_debug_orig_size(s)) return; +#ifdef CONFIG_KASAN_GENERIC + /* + * KASAN could save its free meta data in object's data area at + * offset 0, if the size is larger than 'orig_size', it will + * overlap the data redzone in [orig_size+1, object_size], and + * the check should be skipped. + */ + if (kasan_metadata_size(s, true) > orig_size) + orig_size = s->object_size; +#endif + p += get_info_end(s); p += sizeof(struct track) * 2; @@ -848,6 +859,11 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) return *(unsigned int *)p; } +void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@ -966,17 +982,28 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size; - if (s->flags & SLAB_RED_ZONE) + if (s->flags & SLAB_RED_ZONE) { memset(p - s->red_left_pad, val, s->red_left_pad); + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. + */ + poison_size = get_orig_size(s, object); + } + } + if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset(p, POISON_FREE, poison_size - 1); + p[poison_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset(p + poison_size, val, s->inuse - poison_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -1120,6 +1147,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", @@ -1129,6 +1157,17 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size)) { + return 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, slab, p, "Alignment padding", @@ -4206,7 +4245,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + if (slub_debug_orig_size(s) || + (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || s->ctor) { /* -- cgit v1.2.3-70-g09d2 From a0dc161ae77377ae770b5626bce9b72cff5d9ed6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 24 Oct 2022 16:14:35 +0800 Subject: mm/slub, percpu: correct the calculation of early percpu allocation size SLUB allocator relies on percpu allocator to initialize its ->cpu_slab during early boot. For that, the dynamic chunk of percpu which serves the early allocation need be large enough to satisfy the kmalloc creation. However, the current BUILD_BUG_ON() in alloc_kmem_cache_cpus() doesn't consider the kmalloc array with NR_KMALLOC_TYPES length. Fix that with correct calculation. Signed-off-by: Baoquan He Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Dennis Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5eea9e446672..52b8995a03d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4005,7 +4005,8 @@ init_kmem_cache_node(struct kmem_cache_node *n) static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg -- cgit v1.2.3-70-g09d2 From 838de63b101147fc7d8af828465cf6d1d30232a8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 10 Nov 2022 09:10:30 +0100 Subject: mm/slab: move and adjust kernel-doc for kmem_cache_alloc Alexander reports an issue with the kmem_cache_alloc() comment in mm/slab.c: > The current comment mentioned that the flags only matters if the > cache has no available objects. It's different for the __GFP_ZERO > flag which will ensure that the returned object is always zeroed > in any case. > I have the feeling I run into this question already two times if > the user need to zero the object or not, but the user does not need > to zero the object afterwards. However another use of __GFP_ZERO > and only zero the object if the cache has no available objects would > also make no sense. and suggests thus mentioning __GFP_ZERO as the exception. But on closer inspection, the part about flags being only relevant if cache has no available objects is misleading. The slab user has no reliable way to determine if there are available objects, and e.g. the might_sleep() debug check can be performed even if objects are available, so passing correct flags given the allocation context always matters. Thus remove that sentence completely, and while at it, move the comment to from SLAB-specific mm/slab.c to the common include/linux/slab.h The comment otherwise refers flags description for kmalloc(), so add __GFP_ZERO comment there and remove a very misleading GFP_HIGHUSER (not applicable to slab) description from there. Mention kzalloc() and kmem_cache_zalloc() shortcuts. Reported-by: Alexander Aring Link: https://lore.kernel.org/all/20221011145413.8025-1-aahringo@redhat.com/ Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 23 +++++++++++++++++------ mm/slab.c | 10 ---------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 90877fcde70b..1bf631f29cd2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -441,7 +441,18 @@ static_assert(PAGE_SHIFT <= 20); #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. + * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags. + * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *s, void *objp); @@ -506,9 +517,9 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align __alloc_size(1); /** - * kmalloc - allocate memory + * kmalloc - allocate kernel memory * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * @flags: describe the allocation context * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. @@ -535,12 +546,12 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * - * %GFP_HIGHUSER - * Allocate memory from high memory on behalf of user. - * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * + * %__GFP_ZERO + * Zero the allocated memory before returning. Also see kzalloc(). + * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..f6f3e51317d5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3446,16 +3446,6 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return ret; } -/** - * kmem_cache_alloc - Allocate an object - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache. The flags are only relevant - * if the cache has no available objects. - * - * Return: pointer to the new object or %NULL in case of error - */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { return __kmem_cache_alloc_lru(cachep, NULL, flags); -- cgit v1.2.3-70-g09d2 From 8b8817630ae80032e80b2eaf334de756ac1ff6a3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 4 Nov 2022 15:57:26 +0100 Subject: mm/migrate: make isolate_movable_page() skip slab pages In the next commit we want to rearrange struct slab fields to allow a larger rcu_head. Afterwards, the page->mapping field will overlap with SLUB's "struct list_head slab_list", where the value of prev pointer can become LIST_POISON2, which is 0x122 + POISON_POINTER_DELTA. Unfortunately the bit 1 being set can confuse PageMovable() to be a false positive and cause a GPF as reported by lkp [1]. To fix this, make isolate_movable_page() skip pages with the PageSlab flag set. This is a bit tricky as we need to add memory barriers to SLAB and SLUB's page allocation and freeing, and their counterparts to isolate_movable_page(). Based on my RFC from [2]. Added a comment update from Matthew's variant in [3] and, as done there, moved the PageSlab checks to happen before trying to take the page lock. [1] https://lore.kernel.org/all/208c1757-5edd-fd42-67d4-1940cc43b50f@intel.com/ [2] https://lore.kernel.org/all/aec59f53-0e53-1736-5932-25407125d4d4@suse.cz/ [3] https://lore.kernel.org/all/YzsVM8eToHUeTP75@casper.infradead.org/ Reported-by: kernel test robot Signed-off-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/migrate.c | 15 ++++++++++++--- mm/slab.c | 6 +++++- mm/slub.c | 6 +++++- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..959c99cff814 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -74,13 +74,22 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (unlikely(!get_page_unless_zero(page))) goto out; + if (unlikely(PageSlab(page))) + goto out_putpage; + /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ + smp_rmb(); /* - * Check PageMovable before holding a PG_lock because page's owner - * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grabbing the lock ruins page's owner side. + * Check movable flag before taking the page lock because + * we use non-atomic bitops on newly allocated page flags so + * unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; + /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ + smp_rmb(); + if (unlikely(PageSlab(page))) + goto out_putpage; + /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..219beb48588e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1370,6 +1370,8 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, account_slab(slab, cachep->gfporder, cachep, flags); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -1387,9 +1389,11 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) BUG_ON(!folio_test_slab(folio)); __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); page_mapcount_reset(folio_page(folio, 0)); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; diff --git a/mm/slub.c b/mm/slub.c index 99ba865afc4a..5e6519d5169c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1800,6 +1800,8 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, slab = folio_slab(folio); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); if (page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -2000,8 +2002,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) int pages = 1 << order; __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab(slab, order, s); -- cgit v1.2.3-70-g09d2 From 130d4df57390a29521cb7cccd1b3144c184c111c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Aug 2022 11:09:12 +0200 Subject: mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head Joel reports [1] that increasing the rcu_head size for debugging purposes used to work before struct slab was split from struct page, but now runs into the various SLAB_MATCH() sanity checks of the layout. This is because the rcu_head in struct page is in union with large sub-structures and has space to grow without exceeding their size, while in struct slab (for SLAB and SLUB) it's in union only with a list_head. On closer inspection (and after the previous patch) we can put all fields except slab_cache to a union with rcu_head, as slab_cache is sufficient for the rcu freeing callbacks to work and the rest can be overwritten by rcu_head without causing issues. This is only somewhat complicated by the need to keep SLUB's freelist+counters aligned for cmpxchg_double. As a result the fields need to be reordered so that slab_cache is first (after page flags) and the union with rcu_head follows. For consistency, do that for SLAB as well, although not necessary there. As a result, the rcu_head field in struct page and struct slab is no longer at the same offset, but that doesn't matter as there is no casting that would rely on that in the slab freeing callbacks, so we can just drop the respective SLAB_MATCH() check. Also we need to update the SLAB_MATCH() for compound_head to reflect the new ordering. While at it, also add a static_assert to check the alignment needed for cmpxchg_double so mistakes are found sooner than a runtime GPF. [1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/ Reported-by: Joel Fernandes Signed-off-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slab.h | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..b373952eef70 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -11,37 +11,43 @@ struct slab { #if defined(CONFIG_SLAB) + struct kmem_cache *slab_cache; union { - struct list_head slab_list; + struct { + struct list_head slab_list; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + }; struct rcu_head rcu_head; }; - struct kmem_cache *slab_cache; - void *freelist; /* array of free object indexes */ - void *s_mem; /* first object */ unsigned int active; #elif defined(CONFIG_SLUB) - union { - struct list_head slab_list; - struct rcu_head rcu_head; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; struct kmem_cache *slab_cache; - /* Double-word boundary */ - void *freelist; /* first free object */ union { - unsigned long counters; struct { - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; + union { + struct list_head slab_list; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; }; + struct rcu_head rcu_head; }; unsigned int __unused; @@ -66,9 +72,10 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); -SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #ifndef CONFIG_SLOB -SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +#else +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #endif SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB) +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *))); +#endif /** * folio_slab - Converts from folio to slab. -- cgit v1.2.3-70-g09d2 From 346907ceb9d11b9e22677c142b45ff50dd20a66a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 16 Nov 2022 15:56:32 +0100 Subject: mm, slab: ignore hardened usercopy parameters when disabled With CONFIG_HARDENED_USERCOPY not enabled, there are no __check_heap_object() checks happening that would use the struct kmem_cache useroffset and usersize fields. Yet the fields are still initialized, preventing merging of otherwise compatible caches. Also the fields contribute to struct kmem_cache size unnecessarily when unused. Thus #ifdef them out completely when CONFIG_HARDENED_USERCOPY is disabled. In kmem_dump_obj() print object_size instead of usersize, as that's actually the intention. In a quick virtme boot test, this has reduced the number of caches in /proc/slabinfo from 131 to 111. Cc: Kees Cook Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slab_def.h | 2 ++ include/linux/slub_def.h | 2 ++ mm/slab.h | 2 -- mm/slab_common.c | 13 ++++++++++--- mm/slub.c | 4 ++++ 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index f0ffad6a3365..5834bad8ad78 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,8 +80,10 @@ struct kmem_cache { unsigned int *random_seq; #endif +#ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ +#endif struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f9c68a9dac04..7ed5e455cbf4 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -136,8 +136,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ +#endif struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..db9a7984e22e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -207,8 +207,6 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - unsigned int useroffset;/* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 0042fb2730d1..af0f370fe7a2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -143,8 +143,10 @@ int slab_unmergeable(struct kmem_cache *s) if (s->ctor) return 1; +#ifdef CONFIG_HARDENED_USERCOPY if (s->usersize) return 1; +#endif /* * We may have set a slab to be unmergeable during bootstrap. @@ -223,8 +225,10 @@ static struct kmem_cache *create_cache(const char *name, s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); if (err) @@ -317,7 +321,8 @@ kmem_cache_create_usercopy(const char *name, flags &= CACHE_CREATE_MASK; /* Fail closed on bad usersize of useroffset values. */ - if (WARN_ON(!usersize && useroffset) || + if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || + WARN_ON(!usersize && useroffset) || WARN_ON(size < usersize || size - usersize < useroffset)) usersize = useroffset = 0; @@ -595,8 +600,8 @@ void kmem_dump_obj(void *object) ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; pr_cont(" pointer offset %lu", ptroffset); } - if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) - pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_slab_cache && kp.kp_slab_cache->object_size) + pr_cont(" size %u", kp.kp_slab_cache->object_size); if (kp.kp_ret) pr_cont(" allocated at %pS\n", kp.kp_ret); else @@ -640,8 +645,10 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, align = max(align, size); s->align = calculate_alignment(flags, align, size); +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..e32db8540767 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5502,11 +5502,13 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(cache_dma); #endif +#ifdef CONFIG_HARDENED_USERCOPY static ssize_t usersize_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); +#endif static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { @@ -5803,7 +5805,9 @@ static struct attribute *slab_attrs[] = { #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY &usersize_attr.attr, +#endif #ifdef CONFIG_KFENCE &skip_kfence_attr.attr, #endif -- cgit v1.2.3-70-g09d2 From e240e53ae0abb0896e0f399bdfef41c69cec3123 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Nov 2022 18:13:45 +0100 Subject: mm, slub: add CONFIG_SLUB_TINY For tiny systems that have used SLOB until now, SLUB might be impractical due to its higher memory usage. To help with that, introduce an option CONFIG_SLUB_TINY that modifies SLUB to use less memory. This is done by sacrificing scalability, security and debugging features, therefore not recommended for any system with more than 16MB RAM. This commit introduces the option and uses it to set other related options in a way that reduces memory usage. Signed-off-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- lib/Kconfig.kasan | 2 +- mm/Kconfig | 21 +++++++++++++++++---- mm/Kconfig.debug | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index ca09b1cf8ee9..836f70393e22 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -37,7 +37,7 @@ menuconfig KASAN (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ HAVE_ARCH_KASAN_HW_TAGS - depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + depends on (SLUB && SYSFS && !SLUB_TINY) || (SLAB && !DEBUG_SLAB) select STACKDEPOT_ALWAYS_INIT help Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..6701d72d3037 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -230,6 +230,19 @@ config SLOB endchoice +config SLUB_TINY + bool "Configure SLUB for minimal memory footprint" + depends on SLUB && EXPERT + select SLAB_MERGE_DEFAULT + help + Configures the SLUB allocator in a way to achieve minimal memory + footprint, sacrificing scalability, debugging and other features. + This is intended only for the smallest system that had used the + SLOB allocator and is not recommended for systems with more than + 16MB RAM. + + If unsure, say N. + config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" default y @@ -247,7 +260,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab @@ -255,7 +268,7 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance @@ -267,7 +280,7 @@ config SLAB_FREELIST_HARDENED config SLUB_STATS default n bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY help SLUB statistics are useful to debug SLUBs allocation behavior in order find ways to optimize the allocator. This should never be @@ -279,7 +292,7 @@ config SLUB_STATS config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP + depends on SLUB && SMP && !SLUB_TINY bool "SLUB per cpu partial cache" help Per cpu partial caches accelerate objects allocation and freeing diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index ce8dded36de9..fca699ad1fb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -56,7 +56,7 @@ config DEBUG_SLAB config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can -- cgit v1.2.3-70-g09d2 From b1a413a39a1a7694acf3636a52c109821148ecdd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Nov 2022 18:18:39 +0100 Subject: mm, slub: disable SYSFS support with CONFIG_SLUB_TINY Currently SLUB enables its sysfs support depending unconditionally on the general CONFIG_SYSFS setting. To reduce the configuration combination space, make CONFIG_SLUB_TINY disable SLUB's sysfs support by reusing the existing SLAB_SUPPORTS_SYSFS define. It is unlikely that real tiny systems would combine CONFIG_SLUB_TINY with CONFIG_SYSFS, but a randconfig might. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slub_def.h | 2 +- mm/slub.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 7ed5e455cbf4..95b67863d5cf 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -144,7 +144,7 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_SYSFS +#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS void sysfs_slab_unlink(struct kmem_cache *); void sysfs_slab_release(struct kmem_cache *); diff --git a/mm/slub.c b/mm/slub.c index e32db8540767..b81ceeb6e6de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -298,7 +298,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); #else @@ -2935,7 +2935,7 @@ out: } #endif /* CONFIG_SLUB_DEBUG */ -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct slab *)) { @@ -2949,7 +2949,7 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } -#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) @@ -4924,7 +4924,7 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { return slab->inuse; @@ -5182,7 +5182,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -6060,7 +6060,7 @@ static int __init slab_sysfs_init(void) } __initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +#endif /* SLAB_SUPPORTS_SYSFS */ #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) static int slab_debugfs_show(struct seq_file *seq, void *v) -- cgit v1.2.3-70-g09d2 From 5a8a3c1f73c6488d1a2c18ac1f5308b1fd2aa5f0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Nov 2022 10:50:28 +0100 Subject: mm, slub: retain no free slabs on partial list with CONFIG_SLUB_TINY SLUB will leave a number of slabs on the partial list even if they are empty, to avoid some slab freeing and reallocation. The goal of CONFIG_SLUB_TINY is to minimize memory overhead, so set the limits to 0 for immediate slab page freeing. Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- mm/slub.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b81ceeb6e6de..19b6cf74bdfc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -241,6 +241,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG +#ifndef CONFIG_SLUB_TINY /* * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -253,6 +254,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) -- cgit v1.2.3-70-g09d2 From 90ce872c22b248724d1c87232410e3b38536e107 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 21 Nov 2022 11:44:54 +0100 Subject: mm, slub: lower the default slub_max_order with CONFIG_SLUB_TINY With CONFIG_SLUB_TINY we want to minimize memory overhead. By lowering the default slub_max_order we can make slab allocations use smaller pages. However depending on object sizes, order-0 might not be the best due to increased fragmentation. When testing on a 8MB RAM k210 system by Damien Le Moal [1], slub_max_order=1 had the best results, so use that as the default for CONFIG_SLUB_TINY. [1] https://lore.kernel.org/all/6a1883c4-4c3f-545a-90e8-2cd805bcf4ae@opensource.wdc.com/ Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 19b6cf74bdfc..f4c7f4e3751f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3888,7 +3888,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * take the list_lock. */ static unsigned int slub_min_order; -static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; static unsigned int slub_min_objects; /* -- cgit v1.2.3-70-g09d2 From 2f7c1c1396b587e8cfe18a1f0d628cedaae56b6a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Nov 2022 18:19:28 +0100 Subject: mm, slub: don't create kmalloc-rcl caches with CONFIG_SLUB_TINY Distinguishing kmalloc(__GFP_RECLAIMABLE) can help against fragmentation by grouping pages by mobility, but on tiny systems the extra memory overhead of separate set of kmalloc-rcl caches will probably be worse, and mobility grouping likely disabled anyway. Thus with CONFIG_SLUB_TINY, don't create kmalloc-rcl caches and use the regular ones. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slab.h | 9 +++++++-- mm/slab_common.c | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/slab.h b/include/linux/slab.h index 45efc6c553b8..ae2d19ec8467 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -336,12 +336,17 @@ enum kmalloc_cache_type { #endif #ifndef CONFIG_MEMCG_KMEM KMALLOC_CGROUP = KMALLOC_NORMAL, -#else - KMALLOC_CGROUP, #endif +#ifdef CONFIG_SLUB_TINY + KMALLOC_RECLAIM = KMALLOC_NORMAL, +#else KMALLOC_RECLAIM, +#endif #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, +#endif +#ifdef CONFIG_MEMCG_KMEM + KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES }; diff --git a/mm/slab_common.c b/mm/slab_common.c index af0f370fe7a2..012fc75d3ffa 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -773,10 +773,16 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_CGROUP_NAME(sz) #endif +#ifndef CONFIG_SLUB_TINY +#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz, +#else +#define KMALLOC_RCL_NAME(sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ .size = __size, \ @@ -862,7 +868,7 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { - if (type == KMALLOC_RECLAIM) { + if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { if (mem_cgroup_kmem_disabled()) { -- cgit v1.2.3-70-g09d2 From fa9b88e459d710cadf3b01e8a64eda00cc91cdd6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 21 Nov 2022 16:06:38 +0100 Subject: mm, slub: refactor free debug processing Since commit c7323a5ad078 ("mm/slub: restrict sysfs validation to debug caches and make it safe"), caches with debugging enabled use the free_debug_processing() function to do both freeing checks and actual freeing to partial list under list_lock, bypassing the fast paths. We will want to use the same path for CONFIG_SLUB_TINY, but without the debugging checks, so refactor the code so that free_debug_processing() does only the checks, while the freeing is handled by a new function free_to_partial_list(). For consistency, change return parameter alloc_debug_processing() from int to bool and correct the !SLUB_DEBUG variant to return true and not false. This didn't matter until now, but will in the following changes. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slub.c | 154 +++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 83 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f4c7f4e3751f..d814936649b1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1368,7 +1368,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, return 1; } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static noinline bool alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -1380,7 +1380,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, trace(s, slab, object, 1); set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); - return 1; + return true; bad: if (folio_test_slab(slab_folio(slab))) { @@ -1393,7 +1393,7 @@ bad: slab->inuse = slab->objects; slab->freelist = NULL; } - return 0; + return false; } static inline int free_consistency_checks(struct kmem_cache *s, @@ -1646,17 +1646,17 @@ static inline void setup_object_debug(struct kmem_cache *s, void *object) {} static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, int orig_size) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; } -static inline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) {} +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; } static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(void) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -2833,38 +2833,28 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) } /* Supports checking bulk free of a constructed freelist */ -static noinline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - struct slab *slab_free = NULL; + bool checks_ok = false; void *object = head; int cnt = 0; - unsigned long flags; - bool checks_ok = false; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, slab)) goto out; } - if (slab->inuse < bulk_cnt) { + if (slab->inuse < *bulk_cnt) { slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", - slab->inuse, bulk_cnt); + slab->inuse, *bulk_cnt); goto out; } next_object: - if (++cnt > bulk_cnt) + if (++cnt > *bulk_cnt) goto out_cnt; if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -2886,57 +2876,18 @@ next_object: checks_ok = true; out_cnt: - if (cnt != bulk_cnt) + if (cnt != *bulk_cnt) { slab_err(s, slab, "Bulk free expected %d objects but found %d\n", - bulk_cnt, cnt); - -out: - if (checks_ok) { - void *prior = slab->freelist; - - /* Perform the actual freeing while we still hold the locks */ - slab->inuse -= cnt; - set_freepointer(s, tail, prior); - slab->freelist = head; - - /* - * If the slab is empty, and node's partial list is full, - * it should be discarded anyway no matter it's on full or - * partial list. - */ - if (slab->inuse == 0 && n->nr_partial >= s->min_partial) - slab_free = slab; - - if (!prior) { - /* was on full list */ - remove_full(s, n, slab); - if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } else if (slab_free) { - remove_partial(n, slab); - stat(s, FREE_REMOVE_PARTIAL); - } + *bulk_cnt, cnt); + *bulk_cnt = cnt; } - if (slab_free) { - /* - * Update the counters while still holding n->list_lock to - * prevent spurious validation warnings - */ - dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); - } - - spin_unlock_irqrestore(&n->list_lock, flags); +out: if (!checks_ok) slab_fix(s, "Object at 0x%p not freed", object); - if (slab_free) { - stat(s, FREE_SLAB); - free_slab(s, slab_free); - } + return checks_ok; } #endif /* CONFIG_SLUB_DEBUG */ @@ -3453,6 +3404,67 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); +static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} + /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -3479,7 +3491,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; if (kmem_cache_debug(s)) { - free_debug_processing(s, slab, head, tail, cnt, addr); + free_to_partial_list(s, slab, head, tail, cnt, addr); return; } -- cgit v1.2.3-70-g09d2 From 6cd6d33ca41ff4af21bc25c331ab34b50b4a9c8c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 30 Nov 2022 16:54:51 +0800 Subject: mm/slub, kunit: Add a test case for kmalloc redzone check kmalloc redzone check for slub has been merged, and it's better to add a kunit case for it, which is inspired by a real-world case as described in commit 120ee599b5bf ("staging: octeon-usb: prevent memory corruption"): " octeon-hcd will crash the kernel when SLOB is used. This usually happens after the 18-byte control transfer when a device descriptor is read. The DMA engine is always transferring full 32-bit words and if the transfer is shorter, some random garbage appears after the buffer. The problem is not visible with SLUB since it rounds up the allocations to word boundary, and the extra bytes will go undetected. " To avoid interrupting the normal functioning of kmalloc caches, a kmem_cache mimicing kmalloc cache is created with similar flags, and kmalloc_trace() is used to really test the orig_size and redzone setup. Suggested-by: Vlastimil Babka Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 22 ++++++++++++++++++++++ mm/slab.h | 4 +++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index 5b0c8e7eb6dc..bdf358d520b4 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -135,6 +135,27 @@ static void test_clobber_redzone_free(struct kunit *test) kmem_cache_destroy(s); } +static void test_kmalloc_redzone_access(struct kunit *test) +{ + struct kmem_cache *s = test_kmem_cache_create("TestSlub_RZ_kmalloc", 32, + SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE); + u8 *p = kmalloc_trace(s, GFP_KERNEL, 18); + + kasan_disable_current(); + + /* Suppress the -Warray-bounds warning */ + OPTIMIZER_HIDE_VAR(p); + p[18] = 0xab; + p[19] = 0xab; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + kasan_enable_current(); + kmem_cache_free(s, p); + kmem_cache_destroy(s); +} + static int test_init(struct kunit *test) { slab_errors = 0; @@ -154,6 +175,7 @@ static struct kunit_case test_cases[] = { #endif KUNIT_CASE(test_clobber_redzone_free), + KUNIT_CASE(test_kmalloc_redzone_access), {} }; diff --git a/mm/slab.h b/mm/slab.h index 190f2d4ec216..d5a0b69b81ab 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -336,7 +336,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS) + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC) #else #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif @@ -356,6 +357,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ SLAB_ACCOUNT | \ + SLAB_KMALLOC | \ SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); -- cgit v1.2.3-70-g09d2 From 56d5a2b9ba85a390473e86b4fe4697560242a248 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 21 Nov 2022 16:23:50 +0100 Subject: mm, slub: split out allocations from pre/post hooks In the following patch we want to introduce CONFIG_SLUB_TINY allocation paths that don't use the percpu slab. To prepare, refactor the allocation functions: Split out __slab_alloc_node() from slab_alloc_node() where the former does the actual allocation and the latter calls the pre/post hooks. Analogically, split out __kmem_cache_alloc_bulk() from kmem_cache_alloc_bulk(). Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slub.c | 130 ++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d814936649b1..88f0ce49caab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2907,10 +2907,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, } #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ +#ifdef CONFIG_SLUB_DEBUG static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { -#ifdef CONFIG_SLUB_DEBUG static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; @@ -2941,8 +2941,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } -#endif } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { @@ -3239,45 +3242,13 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return p; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - -/* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. - */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void *object; struct kmem_cache_cpu *c; struct slab *slab; unsigned long tid; - struct obj_cgroup *objcg = NULL; - bool init = false; - - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) - return NULL; - - object = kfence_alloc(s, orig_size, gfpflags); - if (unlikely(object)) - goto out; + void *object; redo: /* @@ -3347,6 +3318,48 @@ redo: stat(s, ALLOC_FASTPATH); } + return object; +} + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + struct obj_cgroup *objcg = NULL; + bool init = false; + + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + if (!s) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -3799,18 +3812,12 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) { struct kmem_cache_cpu *c; int i; - struct obj_cgroup *objcg = NULL; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (unlikely(!s)) - return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -3864,18 +3871,41 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab); - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); return i; + error: slub_put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); kmem_cache_free_bulk(s, i, p); return 0; + +} + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; + struct obj_cgroup *objcg = NULL; + + if (!size) + return 0; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (unlikely(!s)) + return 0; + + i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + if (i != 0) + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); + return i; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3-70-g09d2 From 0af8489b0216fa1dd83e264bef8063f2632633d7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Nov 2022 18:14:31 +0100 Subject: mm, slub: remove percpu slabs with CONFIG_SLUB_TINY SLUB gets most of its scalability by percpu slabs. However for CONFIG_SLUB_TINY the goal is minimal memory overhead, not scalability. Thus, #ifdef out the whole kmem_cache_cpu percpu structure and associated code. Additionally to the slab page savings, this reduces percpu allocator usage, and code size. This change builds on recent commit c7323a5ad078 ("mm/slub: restrict sysfs validation to debug caches and make it safe"), as caches with enabled debugging also avoid percpu slabs and all allocations and freeing ends up working with the partial list. With a bit more refactoring by the preceding patches, use the same code paths with CONFIG_SLUB_TINY. Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- include/linux/slub_def.h | 4 ++ mm/slub.c | 102 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 95b67863d5cf..aa0ee1678d29 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,6 +41,7 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +#ifndef CONFIG_SLUB_TINY /* * When changing the layout, make sure freelist and tid are still compatible * with this_cpu_cmpxchg_double() alignment requirements. @@ -57,6 +58,7 @@ struct kmem_cache_cpu { unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; +#endif /* CONFIG_SLUB_TINY */ #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) @@ -88,7 +90,9 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { +#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; +#endif /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; diff --git a/mm/slub.c b/mm/slub.c index 88f0ce49caab..00d921bd5417 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -337,10 +337,12 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) */ static nodemask_t slab_nodes; +#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; +#endif /******************************************************************** * Core slab cache functions @@ -386,10 +388,12 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_dereference(s, object + s->offset); } +#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } +#endif /* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@ -1681,11 +1685,13 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } +#endif #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2219,7 +2225,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(slab, pc->flags)) continue; - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { object = alloc_single_from_partial(s, n, slab, pc->orig_size); if (object) @@ -2334,6 +2340,8 @@ static void *get_partial(struct kmem_cache *s, int node, struct partial_context return get_any_partial(s, pc); } +#ifndef CONFIG_SLUB_TINY + #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -2347,7 +2355,7 @@ static void *get_partial(struct kmem_cache *s, int node, struct partial_context * different cpus. */ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */ static inline unsigned long next_tid(unsigned long tid) { @@ -2808,6 +2816,13 @@ static int slub_cpu_dead(unsigned int cpu) return 0; } +#else /* CONFIG_SLUB_TINY */ +static inline void flush_all_cpus_locked(struct kmem_cache *s) { } +static inline void flush_all(struct kmem_cache *s) { } +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline int slub_cpu_dead(unsigned int cpu) { return 0; } +#endif /* CONFIG_SLUB_TINY */ + /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -2955,6 +2970,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } +#ifndef CONFIG_SLUB_TINY /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. @@ -3320,6 +3336,33 @@ redo: return object; } +#else /* CONFIG_SLUB_TINY */ +static void *__slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + struct partial_context pc; + struct slab *slab; + void *object; + + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + object = get_partial(s, node, &pc); + + if (object) + return object; + + slab = new_slab(s, gfpflags, node); + if (unlikely(!slab)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + + object = alloc_single_from_new_slab(s, slab, orig_size); + + return object; +} +#endif /* CONFIG_SLUB_TINY */ /* * If the object has been wiped upon free, make sure it's fully initialized by @@ -3503,7 +3546,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; } @@ -3604,6 +3647,7 @@ slab_empty: discard_slab(s, slab); } +#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -3678,6 +3722,16 @@ redo: } stat(s, FREE_FASTPATH); } +#else /* CONFIG_SLUB_TINY */ +static void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) +{ + void *tail_obj = tail ? : head; + + __slab_free(s, slab, head, tail_obj, cnt, addr); +} +#endif /* CONFIG_SLUB_TINY */ static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, @@ -3812,6 +3866,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); +#ifndef CONFIG_SLUB_TINY static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p, struct obj_cgroup *objcg) { @@ -3880,6 +3935,36 @@ error: return 0; } +#else /* CONFIG_SLUB_TINY */ +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) +{ + int i; + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); + + if (unlikely(object)) { + p[i] = object; + continue; + } + + p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, + _RET_IP_, s->object_size); + if (unlikely(!p[i])) + goto error; + + maybe_wipe_obj_freeptr(s, p[i]); + } + + return i; + +error: + slab_post_alloc_hook(s, objcg, flags, i, p, false); + kmem_cache_free_bulk(s, i, p); + return 0; +} +#endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. */ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, @@ -4062,6 +4147,7 @@ init_kmem_cache_node(struct kmem_cache_node *n) #endif } +#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < @@ -4081,6 +4167,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } +#else +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + return 1; +} +#endif /* CONFIG_SLUB_TINY */ static struct kmem_cache *kmem_cache_node; @@ -4143,7 +4235,9 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); +#ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); +#endif free_kmem_cache_nodes(s); } @@ -4920,8 +5014,10 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { +#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); +#endif } struct kmem_cache * -- cgit v1.2.3-70-g09d2 From be784ba861b93c5cd2c0565c5819c290675b50be Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 21 Nov 2022 16:58:39 +0100 Subject: mm, slub: don't aggressively inline with CONFIG_SLUB_TINY SLUB fastpaths use __always_inline to avoid function calls. With CONFIG_SLUB_TINY we would rather save the memory. Add a __fastpath_inline macro that's __always_inline normally but empty with CONFIG_SLUB_TINY. bloat-o-meter results on x86_64 mm/slub.o: add/remove: 3/1 grow/shrink: 1/8 up/down: 865/-1784 (-919) Function old new delta kmem_cache_free 20 281 +261 slab_alloc_node.isra - 245 +245 slab_free.constprop.isra - 231 +231 __kmem_cache_alloc_lru.isra - 128 +128 __kmem_cache_release 88 83 -5 __kmem_cache_create 1446 1436 -10 __kmem_cache_free 271 142 -129 kmem_cache_alloc_node 330 127 -203 kmem_cache_free_bulk.part 826 613 -213 __kmem_cache_alloc_node 230 10 -220 kmem_cache_alloc_lru 325 12 -313 kmem_cache_alloc 325 10 -315 kmem_cache_free.part 376 - -376 Total: Before=26103, After=25184, chg -3.52% Signed-off-by: Vlastimil Babka Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slub.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 00d921bd5417..ac9e4a15fa32 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -187,6 +187,12 @@ do { \ #define USE_LOCKLESS_FAST_PATH() (false) #endif +#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -3386,7 +3392,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; @@ -3412,13 +3418,13 @@ out: return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, +static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, unsigned long addr, size_t orig_size) { return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); } -static __always_inline +static __fastpath_inline void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { @@ -3733,7 +3739,7 @@ static void do_slab_free(struct kmem_cache *s, } #endif /* CONFIG_SLUB_TINY */ -static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, +static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { -- cgit v1.2.3-70-g09d2 From 149b6fa228eda1d191abc440af7162264d716d90 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 11 Nov 2022 11:04:55 +0100 Subject: mm, slob: rename CONFIG_SLOB to CONFIG_SLOB_DEPRECATED As explained in [1], we would like to remove SLOB if possible. - There are no known users that need its somewhat lower memory footprint so much that they cannot handle SLUB (after some modifications by the previous patches) instead. - It is an extra maintenance burden, and a number of features are incompatible with it. - It blocks the API improvement of allowing kfree() on objects allocated via kmem_cache_alloc(). As the first step, rename the CONFIG_SLOB option in the slab allocator configuration choice to CONFIG_SLOB_DEPRECATED. Add CONFIG_SLOB depending on CONFIG_SLOB_DEPRECATED as an internal option to avoid code churn. This will cause existing .config files and defconfigs with CONFIG_SLOB=y to silently switch to the default (and recommended replacement) SLUB, while still allowing SLOB to be configured by anyone that notices and needs it. But those should contact the slab maintainers and linux-mm@kvack.org as explained in the updated help. With no valid objections, the plan is to update the existing defconfigs to SLUB and remove SLOB in a few cycles. To make SLUB more suitable replacement for SLOB, a CONFIG_SLUB_TINY option was introduced to limit SLUB's memory overhead. There is a number of defconfigs specifying CONFIG_SLOB=y. As part of this patch, update them to select CONFIG_SLUB and CONFIG_SLUB_TINY. [1] https://lore.kernel.org/all/b35c3f82-f67b-2103-7d82-7a7ba7521439@suse.cz/ Cc: Russell King Cc: Aaro Koskinen Cc: Janusz Krzysztofik Cc: Tony Lindgren Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Yoshinori Sato Cc: Rich Felker Cc: Arnd Bergmann Cc: Josh Triplett Cc: Conor Dooley Cc: Damien Le Moal Cc: Christophe Leroy Cc: Geert Uytterhoeven Signed-off-by: Vlastimil Babka Acked-by: Aaro Koskinen # OMAP1 Reviewed-by: Damien Le Moal # riscv k210 Acked-by: Arnd Bergmann # arm Acked-by: Roman Gushchin Acked-by: Mike Rapoport Reviewed-by: Christoph Lameter --- arch/arm/configs/clps711x_defconfig | 3 ++- arch/arm/configs/collie_defconfig | 3 ++- arch/arm/configs/multi_v4t_defconfig | 3 ++- arch/arm/configs/omap1_defconfig | 3 ++- arch/arm/configs/pxa_defconfig | 3 ++- arch/arm/configs/tct_hammer_defconfig | 3 ++- arch/arm/configs/xcep_defconfig | 3 ++- arch/openrisc/configs/or1ksim_defconfig | 3 ++- arch/openrisc/configs/simple_smp_defconfig | 3 ++- arch/riscv/configs/nommu_k210_defconfig | 3 ++- arch/riscv/configs/nommu_k210_sdcard_defconfig | 3 ++- arch/riscv/configs/nommu_virt_defconfig | 3 ++- arch/sh/configs/rsk7201_defconfig | 3 ++- arch/sh/configs/rsk7203_defconfig | 3 ++- arch/sh/configs/se7206_defconfig | 3 ++- arch/sh/configs/shmin_defconfig | 3 ++- arch/sh/configs/shx3_defconfig | 3 ++- kernel/configs/tiny.config | 5 +++-- mm/Kconfig | 17 +++++++++++++++-- 19 files changed, 52 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig index 92481b2a88fa..adcee238822a 100644 --- a/arch/arm/configs/clps711x_defconfig +++ b/arch/arm/configs/clps711x_defconfig @@ -14,7 +14,8 @@ CONFIG_ARCH_EDB7211=y CONFIG_ARCH_P720T=y CONFIG_AEABI=y # CONFIG_COREDUMP is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/configs/collie_defconfig b/arch/arm/configs/collie_defconfig index 2a2d2cb3ce2e..69341c33e0cc 100644 --- a/arch/arm/configs/collie_defconfig +++ b/arch/arm/configs/collie_defconfig @@ -13,7 +13,8 @@ CONFIG_CMDLINE="noinitrd root=/dev/mtdblock2 rootfstype=jffs2 fbcon=rotate:1" CONFIG_FPE_NWFPE=y CONFIG_PM=y # CONFIG_SWAP is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/configs/multi_v4t_defconfig b/arch/arm/configs/multi_v4t_defconfig index e2fd822f741a..b60000a89aff 100644 --- a/arch/arm/configs/multi_v4t_defconfig +++ b/arch/arm/configs/multi_v4t_defconfig @@ -25,7 +25,8 @@ CONFIG_ARM_CLPS711X_CPUIDLE=y CONFIG_JUMP_LABEL=y CONFIG_PARTITION_ADVANCED=y # CONFIG_COREDUMP is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index 70511fe4b3ec..246f1bba7df5 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -42,7 +42,8 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y # CONFIG_SWAP is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index d60cc9cc4c21..0a0f12df40b5 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -49,7 +49,8 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_LDM_PARTITION=y CONFIG_CMDLINE_PARTITION=y CONFIG_BINFMT_MISC=y -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_COMPACTION is not set CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/arm/configs/tct_hammer_defconfig b/arch/arm/configs/tct_hammer_defconfig index 3b29ae1fb750..6bd38b6f22c4 100644 --- a/arch/arm/configs/tct_hammer_defconfig +++ b/arch/arm/configs/tct_hammer_defconfig @@ -19,7 +19,8 @@ CONFIG_FPE_NWFPE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_SWAP is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/configs/xcep_defconfig b/arch/arm/configs/xcep_defconfig index ea59e4b6bfc5..6bd9f71b71fc 100644 --- a/arch/arm/configs/xcep_defconfig +++ b/arch/arm/configs/xcep_defconfig @@ -26,7 +26,8 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y # CONFIG_BLOCK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_COMPAT_BRK is not set # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_NET=y diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig index 6e1e004047c7..0116e465238f 100644 --- a/arch/openrisc/configs/or1ksim_defconfig +++ b/arch/openrisc/configs/or1ksim_defconfig @@ -10,7 +10,8 @@ CONFIG_EXPERT=y # CONFIG_AIO is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_MODULES=y # CONFIG_BLOCK is not set CONFIG_OPENRISC_BUILTIN_DTB="or1ksim" diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig index ff49d868e040..b990cb6c9309 100644 --- a/arch/openrisc/configs/simple_smp_defconfig +++ b/arch/openrisc/configs/simple_smp_defconfig @@ -16,7 +16,8 @@ CONFIG_EXPERT=y # CONFIG_AIO is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_MODULES=y # CONFIG_BLOCK is not set CONFIG_OPENRISC_BUILTIN_DTB="simple_smp" diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index 96fe8def644c..79b3ccd58ff0 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -25,7 +25,8 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y CONFIG_NONPORTABLE=y diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 379740654373..6b80bb13b8ed 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -17,7 +17,8 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y CONFIG_NONPORTABLE=y diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index 1a56eda5ce46..4cf0f297091e 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -22,7 +22,8 @@ CONFIG_EXPERT=y # CONFIG_KALLSYMS is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_MMU is not set CONFIG_SOC_VIRT=y CONFIG_NONPORTABLE=y diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig index 619c18699459..376e95fa77bc 100644 --- a/arch/sh/configs/rsk7201_defconfig +++ b/arch/sh/configs/rsk7201_defconfig @@ -10,7 +10,8 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y # CONFIG_AIO is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_PROFILING=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index d00fafc021e1..1d5fd67a3949 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -11,7 +11,8 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_PROFILING=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 122216123e63..78e0e7be57ee 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -21,7 +21,8 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y # CONFIG_ELF_CORE is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index c0b6f40d01cc..e078b193a78a 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -9,7 +9,8 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7706=y CONFIG_MEMORY_START=0x0c000000 diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index 32ec6eb1eabc..aa353dff7f19 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig @@ -20,7 +20,8 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_KALLSYMS_ALL=y -CONFIG_SLOB=y +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y CONFIG_PROFILING=y CONFIG_KPROBES=y CONFIG_MODULES=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 8a44b93da0f3..c2f9c912df1c 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set # CONFIG_SLAB is not set -# CONFIG_SLUB is not set -CONFIG_SLOB=y +# CONFIG_SLOB_DEPRECATED is not set +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y diff --git a/mm/Kconfig b/mm/Kconfig index 6701d72d3037..623d95659ff9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -219,17 +219,30 @@ config SLUB and has enhanced diagnostics. SLUB is the default choice for a slab allocator. -config SLOB +config SLOB_DEPRECATED depends on EXPERT - bool "SLOB (Simple Allocator)" + bool "SLOB (Simple Allocator - DEPRECATED)" depends on !PREEMPT_RT help + Deprecated and scheduled for removal in a few cycles. SLUB + recommended as replacement. CONFIG_SLUB_TINY can be considered + on systems with 16MB or less RAM. + + If you need SLOB to stay, please contact linux-mm@kvack.org and + people listed in the SLAB ALLOCATOR section of MAINTAINERS file, + with your use case. + SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but does not perform as well on large systems. endchoice +config SLOB + bool + default y + depends on SLOB_DEPRECATED + config SLUB_TINY bool "Configure SLUB for minimal memory footprint" depends on SLUB && EXPERT -- cgit v1.2.3-70-g09d2