summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-25 16:51:24 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-25 16:51:24 -0800
commite06635e26cd8144eee17e9f256e8fde8aed3ba4f (patch)
tree24bd832e3098ebf70999a1af6c16087ce4c5995e /mm
parentf5f4745a7f057b58c9728ee4e2c5d6d79f382fe7 (diff)
parent9008fe8fad8255edfdbecea32d7eb0485d939d0d (diff)
Merge tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka: - Add new slab_strict_numa boot parameter to enforce per-object memory policies on top of slab folio policies, for systems where saving cost of remote accesses is more important than minimizing slab allocation overhead (Christoph Lameter) - Fix for freeptr_offset alignment check being too strict for m68k (Geert Uytterhoeven) - krealloc() fixes for not violating __GFP_ZERO guarantees on krealloc() when slub_debug (redzone and object tracking) is enabled (Feng Tang) - Fix a memory leak in case sysfs registration fails for a slab cache, and also no longer fail to create the cache in that case (Hyeonggon Yoo) - Fix handling of detected consistency problems (due to buggy slab user) with slub_debug enabled, so that it does not cause further list corruption bugs (yuan.gao) - Code cleanup and kerneldocs polishing (Zhen Lei, Vlastimil Babka) * tag 'slab-for-6.13-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: slab: Fix too strict alignment check in create_cache() mm/slab: Allow cache creation to proceed even if sysfs registration fails mm/slub: Avoid list corruption when removing a slab from the full list mm/slub, kunit: Add testcase for krealloc redzone and zeroing mm/slub: Improve redzone check and zeroing for krealloc() mm/slub: Consider kfence case for get_orig_size() SLUB: Add support for per object memory policies mm, slab: add kerneldocs for common SLAB_ flags mm/slab: remove duplicate check in create_cache() mm/slub: Move krealloc() and related code to slub.c mm/kasan: Don't store metadata inside kmalloc object when slub_debug_orig_size is on
Diffstat (limited to 'mm')
-rw-r--r--mm/kasan/generic.c7
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c103
-rw-r--r--mm/slub.c218
4 files changed, 222 insertions, 117 deletions
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 6310a180278b..8b9e348113b1 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -392,9 +392,12 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
* 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
* be touched after it was freed, or
* 2. Object has a constructor, which means it's expected to
- * retain its content until the next allocation.
+ * retain its content until the next allocation, or
+ * 3. It is from a kmalloc cache which enables the debug option
+ * to store original size.
*/
- if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor) {
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ slub_debug_orig_size(cache)) {
cache->kasan_info.free_meta_offset = *size;
*size += sizeof(struct kasan_free_meta);
goto free_meta_added;
diff --git a/mm/slab.h b/mm/slab.h
index 6c6fe6d630ce..632fedd71fea 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -73,6 +73,11 @@ struct slab {
struct {
unsigned inuse:16;
unsigned objects:15;
+ /*
+ * If slab debugging is enabled then the
+ * frozen bit can be reused to indicate
+ * that the slab was corrupted
+ */
unsigned frozen:1;
};
};
@@ -695,6 +700,12 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+ return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+ (s->flags & SLAB_KMALLOC));
+}
+
#ifdef CONFIG_SLUB_DEBUG
void skip_orig_size_check(struct kmem_cache *s, const void *object);
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a7174455db9f..a29457bef626 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -222,15 +222,12 @@ static struct kmem_cache *create_cache(const char *name,
struct kmem_cache *s;
int err;
- if (WARN_ON(args->useroffset + args->usersize > object_size))
- args->useroffset = args->usersize = 0;
-
/* If a custom freelist pointer is requested make sure it's sane. */
err = -EINVAL;
if (args->use_freeptr_offset &&
(args->freeptr_offset >= object_size ||
!(flags & SLAB_TYPESAFE_BY_RCU) ||
- !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
+ !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
goto out;
err = -ENOMEM;
@@ -257,11 +254,23 @@ out:
* @object_size: The size of objects to be created in this cache.
* @args: Additional arguments for the cache creation (see
* &struct kmem_cache_args).
- * @flags: See %SLAB_* flags for an explanation of individual @flags.
+ * @flags: See the desriptions of individual flags. The common ones are listed
+ * in the description below.
*
* Not to be called directly, use the kmem_cache_create() wrapper with the same
* parameters.
*
+ * Commonly used @flags:
+ *
+ * &SLAB_ACCOUNT - Account allocations to memcg.
+ *
+ * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
+ *
+ * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
+ *
+ * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
+ * by a grace period - see the full description before using.
+ *
* Context: Cannot be called within a interrupt, but can be interrupted.
*
* Return: a pointer to the cache on success, NULL on failure.
@@ -1199,90 +1208,6 @@ module_init(slab_proc_init);
#endif /* CONFIG_SLUB_DEBUG */
-static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- void *ret;
- size_t ks;
-
- /* Check for double-free before calling ksize. */
- if (likely(!ZERO_OR_NULL_PTR(p))) {
- if (!kasan_check_byte(p))
- return NULL;
- ks = ksize(p);
- } else
- ks = 0;
-
- /* If the object still fits, repoison it precisely. */
- if (ks >= new_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags)) {
- kasan_disable_current();
- memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
- kasan_enable_current();
- }
-
- p = kasan_krealloc((void *)p, new_size, flags);
- return (void *)p;
- }
-
- ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
- if (ret && p) {
- /* Disable KASAN checks as the object's redzone is accessed. */
- kasan_disable_current();
- memcpy(ret, kasan_reset_tag(p), ks);
- kasan_enable_current();
- }
-
- return ret;
-}
-
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
- * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * This is the case, since krealloc() only knows about the bucket size of an
- * allocation (but not the exact size it was allocated with) and hence
- * implements the following semantics for shrinking and growing buffers with
- * __GFP_ZERO.
- *
- * new bucket
- * 0 size size
- * |--------|----------------|
- * | keep | zero |
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
-{
- void *ret;
-
- if (unlikely(!new_size)) {
- kfree(p);
- return ZERO_SIZE_PTR;
- }
-
- ret = __do_krealloc(p, new_size, flags);
- if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
- kfree(p);
-
- return ret;
-}
-EXPORT_SYMBOL(krealloc_noprof);
-
/**
* kfree_sensitive - Clear sensitive information in memory before freeing
* @p: object to free memory of
diff --git a/mm/slub.c b/mm/slub.c
index 5b832512044e..19980419b176 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -218,6 +218,10 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif /* CONFIG_SLUB_DEBUG */
+#ifdef CONFIG_NUMA
+static DEFINE_STATIC_KEY_FALSE(strict_numa);
+#endif
+
/* Structure holding parameters for get_partial() call chain */
struct partial_context {
gfp_t flags;
@@ -230,12 +234,6 @@ static inline bool kmem_cache_debug(struct kmem_cache *s)
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
-static inline bool slub_debug_orig_size(struct kmem_cache *s)
-{
- return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
- (s->flags & SLAB_KMALLOC));
-}
-
void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -760,21 +758,10 @@ static inline void set_orig_size(struct kmem_cache *s,
void *object, unsigned int orig_size)
{
void *p = kasan_reset_tag(object);
- unsigned int kasan_meta_size;
if (!slub_debug_orig_size(s))
return;
- /*
- * KASAN can save its free meta data inside of the object at offset 0.
- * If this meta data size is larger than 'orig_size', it will overlap
- * the data redzone in [orig_size+1, object_size]. Thus, we adjust
- * 'orig_size' to be as at least as big as KASAN's meta data.
- */
- kasan_meta_size = kasan_metadata_size(s, true);
- if (kasan_meta_size > orig_size)
- orig_size = kasan_meta_size;
-
p += get_info_end(s);
p += sizeof(struct track) * 2;
@@ -785,6 +772,9 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
{
void *p = kasan_reset_tag(object);
+ if (is_kfence_address(object))
+ return kfence_ksize(object);
+
if (!slub_debug_orig_size(s))
return s->object_size;
@@ -1423,6 +1413,11 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
slab->inuse, slab->objects);
return 0;
}
+ if (slab->frozen) {
+ slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed");
+ return 0;
+ }
+
/* Slab_pad_check fixes things up after itself */
slab_pad_check(s, slab);
return 1;
@@ -1603,6 +1598,7 @@ bad:
slab_fix(s, "Marking all objects used");
slab->inuse = slab->objects;
slab->freelist = NULL;
+ slab->frozen = 1; /* mark consistency-failed slab as frozen */
}
return false;
}
@@ -2744,7 +2740,8 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
slab->inuse++;
if (!alloc_debug_processing(s, slab, object, orig_size)) {
- remove_partial(n, slab);
+ if (folio_test_slab(slab_folio(slab)))
+ remove_partial(n, slab);
return NULL;
}
@@ -3956,6 +3953,28 @@ redo:
object = c->freelist;
slab = c->slab;
+#ifdef CONFIG_NUMA
+ if (static_branch_unlikely(&strict_numa) &&
+ node == NUMA_NO_NODE) {
+
+ struct mempolicy *mpol = current->mempolicy;
+
+ if (mpol) {
+ /*
+ * Special BIND rule support. If existing slab
+ * is in permitted set then do not redirect
+ * to a particular node.
+ * Otherwise we apply the memory policy to get
+ * the node we need to allocate on.
+ */
+ if (mpol->mode != MPOL_BIND || !slab ||
+ !node_isset(slab_nid(slab), mpol->nodes))
+
+ node = mempolicy_slab_node();
+ }
+ }
+#endif
+
if (!USE_LOCKLESS_FAST_PATH() ||
unlikely(!object || !slab || !node_match(slab, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
@@ -4728,6 +4747,126 @@ void kfree(const void *object)
}
EXPORT_SYMBOL(kfree);
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+ size_t ks = 0;
+ int orig_size = 0;
+ struct kmem_cache *s = NULL;
+
+ if (unlikely(ZERO_OR_NULL_PTR(p)))
+ goto alloc_new;
+
+ /* Check for double-free. */
+ if (!kasan_check_byte(p))
+ return NULL;
+
+ if (is_kfence_address(p)) {
+ ks = orig_size = kfence_ksize(p);
+ } else {
+ struct folio *folio;
+
+ folio = virt_to_folio(p);
+ if (unlikely(!folio_test_slab(folio))) {
+ /* Big kmalloc object */
+ WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE);
+ WARN_ON(p != folio_address(folio));
+ ks = folio_size(folio);
+ } else {
+ s = folio_slab(folio)->slab_cache;
+ orig_size = get_orig_size(s, (void *)p);
+ ks = s->object_size;
+ }
+ }
+
+ /* If the old object doesn't fit, allocate a bigger one */
+ if (new_size > ks)
+ goto alloc_new;
+
+ /* Zero out spare memory. */
+ if (want_init_on_alloc(flags)) {
+ kasan_disable_current();
+ if (orig_size && orig_size < new_size)
+ memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
+ else
+ memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
+ kasan_enable_current();
+ }
+
+ /* Setup kmalloc redzone when needed */
+ if (s && slub_debug_orig_size(s)) {
+ set_orig_size(s, (void *)p, new_size);
+ if (s->flags & SLAB_RED_ZONE && new_size < ks)
+ memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
+ SLUB_RED_ACTIVE, ks - new_size);
+ }
+
+ p = kasan_krealloc(p, new_size, flags);
+ return (void *)p;
+
+alloc_new:
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
+ if (ret && p) {
+ /* Disable KASAN checks as the object's redzone is accessed. */
+ kasan_disable_current();
+ memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
+ kasan_enable_current();
+ }
+
+ return ret;
+}
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
+ * size of an allocation (but not the exact size it was allocated with) and
+ * hence implements the following semantics for shrinking and growing buffers
+ * with __GFP_ZERO.
+ *
+ * new bucket
+ * 0 size size
+ * |--------|----------------|
+ * | keep | zero |
+ *
+ * Otherwise, the original allocation size 'orig_size' could be used to
+ * precisely clear the requested size, and the new size will also be stored
+ * as the new 'orig_size'.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ret = __do_krealloc(p, new_size, flags);
+ if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
+ kfree(p);
+
+ return ret;
+}
+EXPORT_SYMBOL(krealloc_noprof);
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5602,6 +5741,23 @@ static int __init setup_slub_min_objects(char *str)
__setup("slab_min_objects=", setup_slub_min_objects);
__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
+#ifdef CONFIG_NUMA
+static int __init setup_slab_strict_numa(char *str)
+{
+ if (nr_node_ids > 1) {
+ static_branch_enable(&strict_numa);
+ pr_info("SLUB: Strict NUMA enabled.\n");
+ } else {
+ pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
+ }
+
+ return 1;
+}
+
+__setup("slab_strict_numa", setup_slab_strict_numa);
+#endif
+
+
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -5960,7 +6116,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
if (sysfs_slab_alias(s, name))
- return NULL;
+ pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
+ name);
s->refcount++;
@@ -6042,15 +6199,18 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
if (!alloc_kmem_cache_cpus(s))
goto out;
+ err = 0;
+
/* Mutex is not taken during early boot */
- if (slab_state <= UP) {
- err = 0;
+ if (slab_state <= UP)
goto out;
- }
- err = sysfs_slab_add(s);
- if (err)
- goto out;
+ /*
+ * Failing to create sysfs files is not critical to SLUB functionality.
+ * If it fails, proceed with cache creation without these files.
+ */
+ if (sysfs_slab_add(s))
+ pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);
if (s->flags & SLAB_STORE_USER)
debugfs_slab_add(s);
@@ -7120,7 +7280,8 @@ out_del_kobj:
void sysfs_slab_unlink(struct kmem_cache *s)
{
- kobject_del(&s->kobj);
+ if (s->kobj.state_in_sysfs)
+ kobject_del(&s->kobj);
}
void sysfs_slab_release(struct kmem_cache *s)
@@ -7149,6 +7310,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
* If we have a leftover link then remove it.
*/
sysfs_remove_link(&slab_kset->kobj, name);
+ /*
+ * The original cache may have failed to generate sysfs file.
+ * In that case, sysfs_create_link() returns -ENOENT and
+ * symbolic link creation is skipped.
+ */
return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
}