diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-25 16:51:24 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-25 16:51:24 -0800 |
commit | e06635e26cd8144eee17e9f256e8fde8aed3ba4f (patch) | |
tree | 24bd832e3098ebf70999a1af6c16087ce4c5995e /mm | |
parent | f5f4745a7f057b58c9728ee4e2c5d6d79f382fe7 (diff) | |
parent | 9008fe8fad8255edfdbecea32d7eb0485d939d0d (diff) |
Merge tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka:
- Add new slab_strict_numa boot parameter to enforce per-object memory
policies on top of slab folio policies, for systems where saving the cost
of remote accesses is more important than minimizing slab allocation
overhead (Christoph Lameter)
- Fix for freeptr_offset alignment check being too strict for m68k
(Geert Uytterhoeven)
- krealloc() fixes so that __GFP_ZERO guarantees are no longer violated
when slub_debug (redzone and object tracking) is enabled
(Feng Tang)
- Fix a memory leak in case sysfs registration fails for a slab cache,
and also no longer fail to create the cache in that case (Hyeonggon
Yoo)
- Fix handling of detected consistency problems (due to buggy slab
user) with slub_debug enabled, so that it does not cause further list
corruption bugs (yuan.gao)
- Code cleanup and kerneldocs polishing (Zhen Lei, Vlastimil Babka)
* tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab:
slab: Fix too strict alignment check in create_cache()
mm/slab: Allow cache creation to proceed even if sysfs registration fails
mm/slub: Avoid list corruption when removing a slab from the full list
mm/slub, kunit: Add testcase for krealloc redzone and zeroing
mm/slub: Improve redzone check and zeroing for krealloc()
mm/slub: Consider kfence case for get_orig_size()
SLUB: Add support for per object memory policies
mm, slab: add kerneldocs for common SLAB_ flags
mm/slab: remove duplicate check in create_cache()
mm/slub: Move krealloc() and related code to slub.c
mm/kasan: Don't store metadata inside kmalloc object when slub_debug_orig_size is on
Diffstat (limited to 'mm')
-rw-r--r-- | mm/kasan/generic.c | 7 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 103 | ||||
-rw-r--r-- | mm/slub.c | 218 |
4 files changed, 222 insertions, 117 deletions
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 6310a180278b..8b9e348113b1 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -392,9 +392,12 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can * be touched after it was freed, or * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation. + * retain its content until the next allocation, or + * 3. It is from a kmalloc cache which enables the debug option + * to store original size. */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor) { + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + slub_debug_orig_size(cache)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); goto free_meta_added; diff --git a/mm/slab.h b/mm/slab.h index 6c6fe6d630ce..632fedd71fea 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -73,6 +73,11 @@ struct slab { struct { unsigned inuse:16; unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ unsigned frozen:1; }; }; @@ -695,6 +700,12 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + #ifdef CONFIG_SLUB_DEBUG void skip_orig_size_check(struct kmem_cache *s, const void *object); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index a7174455db9f..a29457bef626 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -222,15 +222,12 @@ static struct kmem_cache *create_cache(const char *name, struct kmem_cache *s; int err; - if (WARN_ON(args->useroffset + args->usersize > object_size)) - args->useroffset = args->usersize = 
0; - /* If a custom freelist pointer is requested make sure it's sane. */ err = -EINVAL; if (args->use_freeptr_offset && (args->freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || - !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) + !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t)))) goto out; err = -ENOMEM; @@ -257,11 +254,23 @@ out: * @object_size: The size of objects to be created in this cache. * @args: Additional arguments for the cache creation (see * &struct kmem_cache_args). - * @flags: See %SLAB_* flags for an explanation of individual @flags. + * @flags: See the desriptions of individual flags. The common ones are listed + * in the description below. * * Not to be called directly, use the kmem_cache_create() wrapper with the same * parameters. * + * Commonly used @flags: + * + * &SLAB_ACCOUNT - Account allocations to memcg. + * + * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. + * + * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. + * + * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed + * by a grace period - see the full description before using. + * * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. @@ -1199,90 +1208,6 @@ module_init(slab_proc_init); #endif /* CONFIG_SLUB_DEBUG */ -static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - size_t ks; - - /* Check for double-free before calling ksize. */ - if (likely(!ZERO_OR_NULL_PTR(p))) { - if (!kasan_check_byte(p)) - return NULL; - ks = ksize(p); - } else - ks = 0; - - /* If the object still fits, repoison it precisely. */ - if (ks >= new_size) { - /* Zero out spare memory. 
*/ - if (want_init_on_alloc(flags)) { - kasan_disable_current(); - memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); - kasan_enable_current(); - } - - p = kasan_krealloc((void *)p, new_size, flags); - return (void *)p; - } - - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); - if (ret && p) { - /* Disable KASAN checks as the object's redzone is accessed. */ - kasan_disable_current(); - memcpy(ret, kasan_reset_tag(p), ks); - kasan_enable_current(); - } - - return ret; -} - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size - * is 0 and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * This is the case, since krealloc() only knows about the bucket size of an - * allocation (but not the exact size it was allocated with) and hence - * implements the following semantics for shrinking and growing buffers with - * __GFP_ZERO. - * - * new bucket - * 0 size size - * |--------|----------------| - * | keep | zero | - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. 
- * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc_noprof); - /** * kfree_sensitive - Clear sensitive information in memory before freeing * @p: object to free memory of diff --git a/mm/slub.c b/mm/slub.c index 5b832512044e..19980419b176 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -218,6 +218,10 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +#ifdef CONFIG_NUMA +static DEFINE_STATIC_KEY_FALSE(strict_numa); +#endif + /* Structure holding parameters for get_partial() call chain */ struct partial_context { gfp_t flags; @@ -230,12 +234,6 @@ static inline bool kmem_cache_debug(struct kmem_cache *s) return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } -static inline bool slub_debug_orig_size(struct kmem_cache *s) -{ - return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && - (s->flags & SLAB_KMALLOC)); -} - void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -760,21 +758,10 @@ static inline void set_orig_size(struct kmem_cache *s, void *object, unsigned int orig_size) { void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; if (!slub_debug_orig_size(s)) return; - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. 
- */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - p += get_info_end(s); p += sizeof(struct track) * 2; @@ -785,6 +772,9 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) { void *p = kasan_reset_tag(object); + if (is_kfence_address(object)) + return kfence_ksize(object); + if (!slub_debug_orig_size(s)) return s->object_size; @@ -1423,6 +1413,11 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) slab->inuse, slab->objects); return 0; } + if (slab->frozen) { + slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed"); + return 0; + } + /* Slab_pad_check fixes things up after itself */ slab_pad_check(s, slab); return 1; @@ -1603,6 +1598,7 @@ bad: slab_fix(s, "Marking all objects used"); slab->inuse = slab->objects; slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ } return false; } @@ -2744,7 +2740,8 @@ static void *alloc_single_from_partial(struct kmem_cache *s, slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - remove_partial(n, slab); + if (folio_test_slab(slab_folio(slab))) + remove_partial(n, slab); return NULL; } @@ -3956,6 +3953,28 @@ redo: object = c->freelist; slab = c->slab; +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If existing slab + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. 
+ */ + if (mpol->mode != MPOL_BIND || !slab || + !node_isset(slab_nid(slab), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); @@ -4728,6 +4747,126 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks = 0; + int orig_size = 0; + struct kmem_cache *s = NULL; + + if (unlikely(ZERO_OR_NULL_PTR(p))) + goto alloc_new; + + /* Check for double-free. */ + if (!kasan_check_byte(p)) + return NULL; + + if (is_kfence_address(p)) { + ks = orig_size = kfence_ksize(p); + } else { + struct folio *folio; + + folio = virt_to_folio(p); + if (unlikely(!folio_test_slab(folio))) { + /* Big kmalloc object */ + WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != folio_address(folio)); + ks = folio_size(folio); + } else { + s = folio_slab(folio)->slab_cache; + orig_size = get_orig_size(s, (void *)p); + ks = s->object_size; + } + } + + /* If the old object doesn't fit, allocate a bigger one */ + if (new_size > ks) + goto alloc_new; + + /* Zero out spare memory. 
*/ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + if (orig_size && orig_size < new_size) + memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size); + else + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); + } + + /* Setup kmalloc redzone when needed */ + if (s && slub_debug_orig_size(s)) { + set_orig_size(s, (void *)p, new_size); + if (s->flags & SLAB_RED_ZONE && new_size < ks) + memset_no_sanitize_memory(kasan_reset_tag(p) + new_size, + SLUB_RED_ACTIVE, ks - new_size); + } + + p = kasan_krealloc(p, new_size, flags); + return (void *)p; + +alloc_new: + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + kasan_enable_current(); + } + + return ret; +} + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * When slub_debug_orig_size() is off, krealloc() only knows about the bucket + * size of an allocation (but not the exact size it was allocated with) and + * hence implements the following semantics for shrinking and growing buffers + * with __GFP_ZERO. 
+ * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * Otherwise, the original allocation size 'orig_size' could be used to + * precisely clear the requested size, and the new size will also be stored + * as the new 'orig_size'. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5602,6 +5741,23 @@ static int __init setup_slub_min_objects(char *str) __setup("slab_min_objects=", setup_slub_min_objects); __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +#ifdef CONFIG_NUMA +static int __init setup_slab_strict_numa(char *str) +{ + if (nr_node_ids > 1) { + static_branch_enable(&strict_numa); + pr_info("SLUB: Strict NUMA enabled.\n"); + } else { + pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); + } + + return 1; +} + +__setup("slab_strict_numa", setup_slab_strict_numa); +#endif + + #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -5960,7 +6116,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, s = find_mergeable(size, align, flags, name, ctor); if (s) { if (sysfs_slab_alias(s, name)) - return NULL; + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); s->refcount++; @@ -6042,15 +6199,18 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + err = 0; + /* Mutex is not taken during early boot */ - if 
(slab_state <= UP) { - err = 0; + if (slab_state <= UP) goto out; - } - err = sysfs_slab_add(s); - if (err) - goto out; + /* + * Failing to create sysfs files is not critical to SLUB functionality. + * If it fails, proceed with cache creation without these files. + */ + if (sysfs_slab_add(s)) + pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name); if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); @@ -7120,7 +7280,8 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - kobject_del(&s->kobj); + if (s->kobj.state_in_sysfs) + kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) @@ -7149,6 +7310,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) * If we have a leftover link then remove it. */ sysfs_remove_link(&slab_kset->kobj, name); + /* + * The original cache may have failed to generate sysfs file. + * In that case, sysfs_create_link() returns -ENOENT and + * symbolic link creation is skipped. + */ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } |