author     Vlastimil Babka <vbabka@suse.cz>	2024-09-13 11:08:27 +0200
committer  Vlastimil Babka <vbabka@suse.cz>	2024-09-13 11:08:27 +0200
commit     a715e94dbda4ece41aac49b7b7ff8ddb55a7fe08 (patch)
tree       337ca3751374479574ff2d2af58a8759b15e237b
parent     e02147cb703412fa13dd31908c734d7fb2314f55 (diff)
parent     9028cdeb38e1f37d63cb3154799dd259b67e879e (diff)
Merge branch 'slab/for-6.12/rcu_barriers' into slab/for-next
Merge most of SLUB feature work for 6.12:
- Barrier for pending kfree_rcu() in kmem_cache_destroy() and associated
refactoring of the destroy path (Vlastimil Babka)
- CONFIG_SLUB_RCU_DEBUG to allow KASAN catching UAF bugs in
SLAB_TYPESAFE_BY_RCU caches (Jann Horn)
- kmem_cache_charge() for delayed kmemcg charging (Shakeel Butt)
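
As a quick illustration of the kmem_cache_charge() item, here is a minimal, hypothetical sketch (the foo_* names are invented for this note and are not part of the merge; the call pattern mirrors the inet_csk_accept() hunk at the end of this diff): an object is allocated in a context where the owning memcg is not yet known and is charged later, once it is.

    #include <linux/slab.h>

    /* Hypothetical example -- struct foo_conn and foo_cachep are illustration only. */
    struct foo_conn {
    	int id;
    };

    static struct kmem_cache *foo_cachep;

    /* Softirq path: the target memcg is not known yet, so allocate without charging. */
    static struct foo_conn *foo_conn_alloc(void)
    {
    	return kmem_cache_alloc(foo_cachep, GFP_ATOMIC);
    }

    /* Later, in process context: charge the already-allocated object to the current memcg. */
    static bool foo_conn_adopt(struct foo_conn *conn)
    {
    	return kmem_cache_charge(conn, GFP_KERNEL);
    }

Per the kmem_cache_charge() kerneldoc added in this merge, the call can return true without actually charging (for example with !CONFIG_MEMCG, already-charged objects, or KMALLOC_NORMAL objects allocated without __GFP_ACCOUNT).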
-rw-r--r--  include/linux/kasan.h            |  63
-rw-r--r--  include/linux/rcutiny.h          |   5
-rw-r--r--  include/linux/rcutree.h          |   1
-rw-r--r--  include/linux/slab.h             |  29
-rw-r--r--  kernel/rcu/tree.c                | 109
-rw-r--r--  lib/slub_kunit.c                 |  31
-rw-r--r--  mm/Kconfig.debug                 |  32
-rw-r--r--  mm/kasan/common.c                |  62
-rw-r--r--  mm/kasan/kasan_test.c            |  46
-rw-r--r--  mm/slab.h                        |   7
-rw-r--r--  mm/slab_common.c                 | 127
-rw-r--r--  mm/slub.c                        | 139
-rw-r--r--  net/ipv4/inet_connection_sock.c  |   5
13 files changed, 528 insertions, 128 deletions
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..00a3bf7c0d8f 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -175,13 +175,59 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 	return (void *)object;
 }
 
-bool __kasan_slab_free(struct kmem_cache *s, void *object,
-			unsigned long ip, bool init);
+bool __kasan_slab_pre_free(struct kmem_cache *s, void *object,
+			unsigned long ip);
+/**
+ * kasan_slab_pre_free - Check whether freeing a slab object is safe.
+ * @object: Object to be freed.
+ *
+ * This function checks whether freeing the given object is safe. It may
+ * check for double-free and invalid-free bugs and report them.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if freeing the object is unsafe; false otherwise.
+ */
+static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
+						void *object)
+{
+	if (kasan_enabled())
+		return __kasan_slab_pre_free(s, object, _RET_IP_);
+	return false;
+}
+
+bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
+		       bool still_accessible);
+/**
+ * kasan_slab_free - Poison, initialize, and quarantine a slab object.
+ * @object: Object to be freed.
+ * @init: Whether to initialize the object.
+ * @still_accessible: Whether the object contents are still accessible.
+ *
+ * This function informs that a slab object has been freed and is not
+ * supposed to be accessed anymore, except when @still_accessible is set
+ * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU
+ * grace period might not have passed yet).
+ *
+ * For KASAN modes that have integrated memory initialization
+ * (kasan_has_integrated_init() == true), this function also initializes
+ * the object's memory. For other modes, the @init argument is ignored.
+ *
+ * This function might also take ownership of the object to quarantine it.
+ * When this happens, KASAN will defer freeing the object to a later
+ * stage and handle it internally until then. The return value indicates
+ * whether KASAN took ownership of the object.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if KASAN took ownership of the object; false otherwise.
+ */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-						void *object, bool init)
+						void *object, bool init,
+						bool still_accessible)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, _RET_IP_, init);
+		return __kasan_slab_free(s, object, init, still_accessible);
 	return false;
 }
 
@@ -371,7 +417,14 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
+
+static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
+				   bool init, bool still_accessible)
 {
 	return false;
 }
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d9ac7b136aea..522123050ff8 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	kvfree(ptr);
 }
 
+static inline void kvfree_rcu_barrier(void)
+{
+	rcu_barrier();
+}
+
 #ifdef CONFIG_KASAN_GENERIC
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 #else
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 254244202ea9..58e7db80f3a8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
 
 void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kvfree_rcu_barrier(void);
 void rcu_barrier(void);
 void rcu_momentary_dyntick_idle(void);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index eb2bf4629157..3be2a5ed4936 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -547,6 +547,35 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 			   gfp_t gfpflags) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_lru(...)	alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))
 
+/**
+ * kmem_cache_charge - memcg charge an already allocated slab memory
+ * @objp: address of the slab object to memcg charge
+ * @gfpflags: describe the allocation context
+ *
+ * kmem_cache_charge allows charging a slab object to the current memcg,
+ * primarily in cases where charging at allocation time might not be possible
+ * because the target memcg is not known (i.e. softirq context)
+ *
+ * The objp should be pointer returned by the slab allocator functions like
+ * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge
+ * behavior can be controlled through gfpflags parameter, which affects how the
+ * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes
+ * that overcharging is requested instead of failure, but is not applied for the
+ * internal metadata allocation.
+ *
+ * There are several cases where it will return true even if the charging was
+ * not done:
+ * More specifically:
+ *
+ * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems.
+ * 2. Already charged slab objects.
+ * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc()
+ *    without __GFP_ACCOUNT
+ * 4. Allocating internal metadata has failed
+ *
+ * Return: true if charge was successful otherwise false.
+ */
+bool kmem_cache_charge(void *objp, gfp_t gfpflags);
 void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..be00aac5f4e7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
 }
 
 /*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * Return: %true if a work is queued, %false otherwise.
 */
-static void kfree_rcu_monitor(struct work_struct *work)
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
 {
-	struct kfree_rcu_cpu *krcp = container_of(work,
-		struct kfree_rcu_cpu, monitor_work.work);
 	unsigned long flags;
+	bool queued = false;
 	int i, j;
 
-	// Drain ready for reclaim.
-	kvfree_rcu_drain_ready(krcp);
-
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 
 	// Attempt to start a new batch.
@@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 	}
 
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
+	return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	struct kfree_rcu_cpu *krcp = container_of(work,
+		struct kfree_rcu_cpu, monitor_work.work);
+
+	// Drain ready for reclaim.
+	kvfree_rcu_drain_ready(krcp);
+
+	// Queue a batch for a rest.
+	kvfree_rcu_queue_batch(krcp);
+
 	// If there is nothing to detach, it means that our job is
 	// successfully done here. In case of having at least one
@@ -3859,6 +3872,86 @@ unlock_return:
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	struct kfree_rcu_cpu_work *krwp;
+	struct kfree_rcu_cpu *krcp;
+	bool queued;
+	int i, cpu;
+
+	/*
+	 * Firstly we detach objects and queue them over an RCU-batch
+	 * for all CPUs. Finally queued works are flushed for each CPU.
+	 *
+	 * Please note. If there are outstanding batches for a particular
+	 * CPU, those have to be finished first following by queuing a new.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * Check if this CPU has any objects which have been queued for a
+		 * new GP completion. If not(means nothing to detach), we are done
+		 * with it. If any batch is pending/running for this "krcp", below
+		 * per-cpu flush_rcu_work() waits its completion(see last step).
+		 */
+		if (!need_offload_krc(krcp))
+			continue;
+
+		while (1) {
+			/*
+			 * If we are not able to queue a new RCU work it means:
+			 * - batches for this CPU are still in flight which should
+			 *   be flushed first and then repeat;
+			 * - no objects to detach, because of concurrency.
+			 */
+			queued = kvfree_rcu_queue_batch(krcp);
+
+			/*
+			 * Bail out, if there is no need to offload this "krcp"
+			 * anymore. As noted earlier it can run concurrently.
+			 */
+			if (queued || !need_offload_krc(krcp))
+				break;
+
+			/* There are ongoing batches. */
+			for (i = 0; i < KFREE_N_BATCHES; i++) {
+				krwp = &(krcp->krw_arr[i]);
+				flush_rcu_work(&krwp->rcu_work);
+			}
+		}
+	}
+
+	/*
+	 * Now we guarantee that all objects are flushed.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * A monitor work can drain ready to reclaim objects
+		 * directly. Wait its completion if running or pending.
+		 */
+		cancel_delayed_work_sync(&krcp->monitor_work);
+
+		for (i = 0; i < KFREE_N_BATCHES; i++) {
+			krwp = &(krcp->krw_arr[i]);
+			flush_rcu_work(&krwp->rcu_work);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
index e6667a28c014..6e3a1e5a7142 100644
--- a/lib/slub_kunit.c
+++ b/lib/slub_kunit.c
@@ -5,6 +5,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/rcupdate.h>
 #include "../mm/slab.h"
 
 static struct kunit_resource resource;
@@ -157,6 +158,34 @@ static void test_kmalloc_redzone_access(struct kunit *test)
 	kmem_cache_destroy(s);
 }
 
+struct test_kfree_rcu_struct {
+	struct rcu_head rcu;
+};
+
+static void test_kfree_rcu(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+				sizeof(struct test_kfree_rcu_struct),
+				SLAB_NO_MERGE);
+	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
+
+	kfree_rcu(p, rcu);
+	kmem_cache_destroy(s);
+
+	KUNIT_EXPECT_EQ(test, 0, slab_errors);
+}
+
+static void test_leak_destroy(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+				64, SLAB_NO_MERGE);
+	kmem_cache_alloc(s, GFP_KERNEL);
+
+	kmem_cache_destroy(s);
+
+	KUNIT_EXPECT_EQ(test, 1, slab_errors);
+}
+
 static int test_init(struct kunit *test)
 {
 	slab_errors = 0;
@@ -177,6 +206,8 @@ static struct kunit_case test_cases[] = {
 	KUNIT_CASE(test_clobber_redzone_free),
 	KUNIT_CASE(test_kmalloc_redzone_access),
+	KUNIT_CASE(test_kfree_rcu),
+	KUNIT_CASE(test_leak_destroy),
 	{}
 };
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afc72fde0f03..41a58536531d 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -70,6 +70,38 @@ config SLUB_DEBUG_ON
 	  off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
 	  "slab_debug=-".
 
+config SLUB_RCU_DEBUG
+	bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)"
+	depends on SLUB_DEBUG
+	# SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless
+	# without KASAN, so mark it as a dependency of KASAN for now.
+	depends on KASAN
+	default KASAN_GENERIC || KASAN_SW_TAGS
+	help
+	  Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
+	  was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
+	  kfree_rcu() instead.
+
+	  This is intended for use in combination with KASAN, to enable KASAN to
+	  detect use-after-free accesses in such caches.
+	  (KFENCE is able to do that independent of this flag.)
+
+	  This might degrade performance.
+	  Unfortunately this also prevents a very specific bug pattern from
+	  triggering (insufficient checks against an object being recycled
+	  within the RCU grace period); so this option can be turned off even on
+	  KASAN builds, in case you want to test for such a bug.
+
+	  If you're using this for testing bugs / fuzzing and care about
+	  catching all the bugs WAY more than performance, you might want to
+	  also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
+
+	  WARNING:
+	  This is designed as a debugging feature, not a security feature.
+	  Objects are sometimes recycled without RCU delay under memory pressure.
+
+	  If unsure, say N.
+
 config PAGE_OWNER
 	bool "Track page owner"
 	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 85e7c6b4575c..ed4873e18c75 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -208,15 +208,12 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
 	return (void *)object;
 }
 
-static inline bool poison_slab_object(struct kmem_cache *cache, void *object,
-				      unsigned long ip, bool init)
+/* Returns true when freeing the object is not safe. */
+static bool check_slab_allocation(struct kmem_cache *cache, void *object,
+				  unsigned long ip)
 {
-	void *tagged_object;
-
-	if (!kasan_arch_is_ready())
-		return false;
+	void *tagged_object = object;
 
-	tagged_object = object;
 	object = kasan_reset_tag(object);
 
 	if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) {
@@ -224,37 +221,47 @@ static inline bool poison_slab_object(struct kmem_cache *cache, void *object,
 		return true;
 	}
 
-	/* RCU slabs could be legally used after free within the RCU period. */
-	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
-		return false;
-
 	if (!kasan_byte_accessible(tagged_object)) {
 		kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE);
 		return true;
 	}
 
+	return false;
+}
+
+static inline void poison_slab_object(struct kmem_cache *cache, void *object,
+				      bool init, bool still_accessible)
+{
+	void *tagged_object = object;
+
+	object = kasan_reset_tag(object);
+
+	/* RCU slabs could be legally used after free within the RCU period. */
+	if (unlikely(still_accessible))
+		return;
+
 	kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
 			KASAN_SLAB_FREE, init);
 
 	if (kasan_stack_collection_enabled())
 		kasan_save_free_info(cache, tagged_object);
+}
 
-	return false;
+bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object,
+				unsigned long ip)
+{
+	if (!kasan_arch_is_ready() || is_kfence_address(object))
+		return false;
+	return check_slab_allocation(cache, object, ip);
 }
 
-bool __kasan_slab_free(struct kmem_cache *cache, void *object,
-				unsigned long ip, bool init)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
+		       bool still_accessible)
 {
-	if (is_kfence_address(object))
+	if (!kasan_arch_is_ready() || is_kfence_address(object))
 		return false;
 
-	/*
-	 * If the object is buggy, do not let slab put the object onto the
-	 * freelist. The object will thus never be allocated again and its
-	 * metadata will never get released.
-	 */
-	if (poison_slab_object(cache, object, ip, init))
-		return true;
+	poison_slab_object(cache, object, init, still_accessible);
 
 	/*
 	 * If the object is put into quarantine, do not let slab put the object
@@ -504,11 +511,16 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 		return true;
 	}
 
-	if (is_kfence_address(ptr))
-		return false;
+	if (is_kfence_address(ptr) || !kasan_arch_is_ready())
+		return true;
 
 	slab = folio_slab(folio);
-	return !poison_slab_object(slab->slab_cache, ptr, ip, false);
+
+	if (check_slab_allocation(slab->slab_cache, ptr, ip))
+		return false;
+
+	poison_slab_object(slab->slab_cache, ptr, false, false);
+	return true;
 }
 
 void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 7b32be2a3cf0..567d33b493e2 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -996,6 +996,51 @@ static void kmem_cache_invalid_free(struct kunit *test)
 	kmem_cache_destroy(cache);
 }
 
+static void kmem_cache_rcu_uaf(struct kunit *test)
+{
+	char *p;
+	size_t size = 200;
+	struct kmem_cache *cache;
+
+	KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB_RCU_DEBUG);
+
+	cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
+				  NULL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+	p = kmem_cache_alloc(cache, GFP_KERNEL);
+	if (!p) {
+		kunit_err(test, "Allocation failed: %s\n", __func__);
+		kmem_cache_destroy(cache);
+		return;
+	}
+	*p = 1;
+
+	rcu_read_lock();
+
+	/* Free the object - this will internally schedule an RCU callback. */
+	kmem_cache_free(cache, p);
+
+	/*
+	 * We should still be allowed to access the object at this point because
+	 * the cache is SLAB_TYPESAFE_BY_RCU and we've been in an RCU read-side
+	 * critical section since before the kmem_cache_free().
+	 */
+	READ_ONCE(*p);
+
+	rcu_read_unlock();
+
+	/*
+	 * Wait for the RCU callback to execute; after this, the object should
+	 * have actually been freed from KASAN's perspective.
+	 */
+	rcu_barrier();
+
+	KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p));
+
+	kmem_cache_destroy(cache);
+}
+
 static void empty_cache_ctor(void *object) { }
 
 static void kmem_cache_double_destroy(struct kunit *test)
@@ -1937,6 +1982,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
 	KUNIT_CASE(kmem_cache_oob),
 	KUNIT_CASE(kmem_cache_double_free),
 	KUNIT_CASE(kmem_cache_invalid_free),
+	KUNIT_CASE(kmem_cache_rcu_uaf),
 	KUNIT_CASE(kmem_cache_double_destroy),
 	KUNIT_CASE(kmem_cache_accounted),
 	KUNIT_CASE(kmem_cache_bulk),
diff --git a/mm/slab.h b/mm/slab.h
index dcdb56b8e7f5..9f907e930609 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -443,6 +443,13 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
 	return (s->flags & SLAB_KMALLOC);
 }
 
+static inline bool is_kmalloc_normal(struct kmem_cache *s)
+{
+	if (!is_kmalloc_cache(s))
+		return false;
+	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
+}
+
 /* Legal flag mask for kmem_cache_create(), for various configurations */
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 85afeb69b3c0..11ef221bce17 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -40,11 +40,6 @@ LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
 struct kmem_cache *kmem_cache;
 
-static LIST_HEAD(slab_caches_to_rcu_destroy);
-static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
-static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
-		    slab_caches_to_rcu_destroy_workfn);
-
 /*
  * Set of flags that will prevent slab merging
  */
@@ -502,81 +497,19 @@ fail:
 }
 EXPORT_SYMBOL(kmem_buckets_create);
 
-#ifdef SLAB_SUPPORTS_SYSFS
 /*
  * For a given kmem_cache, kmem_cache_destroy() should only be called
  * once or there will be a use-after-free problem. The actual deletion
  * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
  * protection. So they are now done without holding those locks.
- *
- * Note that there will be a slight delay in the deletion of sysfs files
- * if kmem_cache_release() is called indrectly from a work function.
  */
 static void kmem_cache_release(struct kmem_cache *s)
 {
-	if (slab_state >= FULL) {
-		sysfs_slab_unlink(s);
+	kfence_shutdown_cache(s);
+	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
 		sysfs_slab_release(s);
-	} else {
+	else
 		slab_kmem_cache_release(s);
-	}
-}
-#else
-static void kmem_cache_release(struct kmem_cache *s)
-{
-	slab_kmem_cache_release(s);
-}
-#endif
-
-static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
-{
-	LIST_HEAD(to_destroy);
-	struct kmem_cache *s, *s2;
-
-	/*
-	 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
-	 * @slab_caches_to_rcu_destroy list. The slab pages are freed
-	 * through RCU and the associated kmem_cache are dereferenced
-	 * while freeing the pages, so the kmem_caches should be freed only
-	 * after the pending RCU operations are finished. As rcu_barrier()
-	 * is a pretty slow operation, we batch all pending destructions
-	 * asynchronously.
-	 */
-	mutex_lock(&slab_mutex);
-	list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
-	mutex_unlock(&slab_mutex);
-
-	if (list_empty(&to_destroy))
-		return;
-
-	rcu_barrier();
-
-	list_for_each_entry_safe(s, s2, &to_destroy, list) {
-		debugfs_slab_release(s);
-		kfence_shutdown_cache(s);
-		kmem_cache_release(s);
-	}
-}
-
-static int shutdown_cache(struct kmem_cache *s)
-{
-	/* free asan quarantined objects */
-	kasan_cache_shutdown(s);
-
-	if (__kmem_cache_shutdown(s) != 0)
-		return -EBUSY;
-
-	list_del(&s->list);
-
-	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
-		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
-		schedule_work(&slab_caches_to_rcu_destroy_work);
-	} else {
-		kfence_shutdown_cache(s);
-		debugfs_slab_release(s);
-	}
-
-	return 0;
 }
 
 void slab_kmem_cache_release(struct kmem_cache *s)
@@ -588,29 +521,63 @@ void kmem_cache_destroy(struct kmem_cache *s)
 {
-	int err = -EBUSY;
-	bool rcu_set;
+	int err;
 
 	if (unlikely(!s) || !kasan_check_byte(s))
 		return;
 
+	/* in-flight kfree_rcu()'s may include objects from our cache */
+	kvfree_rcu_barrier();
+
+	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
+	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
+		/*
+		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
+		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
+		 * defer their freeing with call_rcu().
+		 * Wait for such call_rcu() invocations here before actually
+		 * destroying the cache.
+		 *
+		 * It doesn't matter that we haven't looked at the slab refcount
+		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
+		 * the refcount should be 1 here.
+		 */
+		rcu_barrier();
+	}
+
 	cpus_read_lock();
 	mutex_lock(&slab_mutex);
 
-	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
-
 	s->refcount--;
-	if (s->refcount)
-		goto out_unlock;
+	if (s->refcount) {
+		mutex_unlock(&slab_mutex);
+		cpus_read_unlock();
+		return;
+	}
 
-	err = shutdown_cache(s);
+	/* free asan quarantined objects */
+	kasan_cache_shutdown(s);
+
+	err = __kmem_cache_shutdown(s);
 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
 	     __func__, s->name, (void *)_RET_IP_);
-out_unlock:
+
+	list_del(&s->list);
+
 	mutex_unlock(&slab_mutex);
 	cpus_read_unlock();
-	if (!err && !rcu_set)
-		kmem_cache_release(s);
+
+	if (slab_state >= FULL)
+		sysfs_slab_unlink(s);
+	debugfs_slab_release(s);
+
+	if (err)
+		return;
+
+	if (s->flags & SLAB_TYPESAFE_BY_RCU)
+		rcu_barrier();
+
+	kmem_cache_release(s);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index d52c88f29f69..81cea762d094 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2184,6 +2184,45 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 	__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
 }
+
+static __fastpath_inline
+bool memcg_slab_post_charge(void *p, gfp_t flags)
+{
+	struct slabobj_ext *slab_exts;
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+	unsigned long off;
+
+	folio = virt_to_folio(p);
+	if (!folio_test_slab(folio)) {
+		return folio_memcg_kmem(folio) ||
+			(__memcg_kmem_charge_page(folio_page(folio, 0), flags,
+						  folio_order(folio)) == 0);
+	}
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+
+	/*
+	 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
+	 * of slab_obj_exts being allocated from the same slab and thus the slab
+	 * becoming effectively unfreeable.
+	 */
+	if (is_kmalloc_normal(s))
+		return true;
+
+	/* Ignore already charged objects. */
+	slab_exts = slab_obj_exts(slab);
+	if (slab_exts) {
+		off = obj_to_index(s, slab, p);
+		if (unlikely(slab_exts[off].objcg))
+			return true;
+	}
+
+	return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
+}
+
 #else /* CONFIG_MEMCG */
 static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
 					      struct list_lru *lru,
@@ -2197,18 +2236,37 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s,
 					struct slab *slab, void **p, int objects)
 {
 }
+
+static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
+{
+	return true;
+}
 #endif /* CONFIG_MEMCG */
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
+
+struct rcu_delayed_free {
+	struct rcu_head head;
+	void *object;
+};
+#endif
+
 /*
  * Hooks for other subsystems that check memory allocations. In a typical
  * production configuration these hooks all should produce no code at all.
  *
  * Returns true if freeing of the object can proceed, false if its reuse
- * was delayed by KASAN quarantine, or it was returned to KFENCE.
+ * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
+ * to KFENCE.
  */
 static __always_inline
-bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
+bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
+		    bool after_rcu_delay)
 {
+	/* Are the object contents still accessible? */
+	bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
+
 	kmemleak_free_recursive(x, s->flags);
 	kmsan_slab_free(s, x);
@@ -2218,7 +2276,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
 		debug_check_no_obj_freed(x, s->object_size);
 
 	/* Use KCSAN to help debug racy use-after-free. */
-	if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+	if (!still_accessible)
 		__kcsan_check_access(x, s->object_size,
 				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 
@@ -2226,6 +2284,35 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
 		return false;
 
 	/*
+	 * Give KASAN a chance to notice an invalid free operation before we
+	 * modify the object.
+	 */
+	if (kasan_slab_pre_free(s, x))
+		return false;
+
+#ifdef CONFIG_SLUB_RCU_DEBUG
+	if (still_accessible) {
+		struct rcu_delayed_free *delayed_free;
+
+		delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
+		if (delayed_free) {
+			/*
+			 * Let KASAN track our call stack as a "related work
+			 * creation", just like if the object had been freed
+			 * normally via kfree_rcu().
+			 * We have to do this manually because the rcu_head is
+			 * not located inside the object.
+			 */
+			kasan_record_aux_stack_noalloc(x);
+
+			delayed_free->object = x;
+			call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
+			return false;
+		}
+	}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
+	/*
 	 * As memory initialization might be integrated into KASAN,
 	 * kasan_slab_free and initialization memset's must be
 	 * kept together to avoid discrepancies in behavior.
@@ -2255,7 +2342,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
 	}
 
 	/* KASAN might put x into memory quarantine, delaying its reuse. */
-	return !kasan_slab_free(s, x, init);
+	return !kasan_slab_free(s, x, init, still_accessible);
 }
 
 static __fastpath_inline
@@ -2269,7 +2356,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
 	bool init;
 
 	if (is_kfence_address(next)) {
-		slab_free_hook(s, next, false);
+		slab_free_hook(s, next, false, false);
 		return false;
 	}
 
@@ -2284,7 +2371,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
 		next = get_freepointer(s, object);
 
 		/* If object's reuse doesn't have to be delayed */
-		if (likely(slab_free_hook(s, object, init))) {
+		if (likely(slab_free_hook(s, object, init, false))) {
 			/* Move object to the new freelist */
 			set_freepointer(s, object, *head);
 			*head = object;
@@ -4073,6 +4160,15 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 }
 EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
 
+bool kmem_cache_charge(void *objp, gfp_t gfpflags)
+{
+	if (!memcg_kmem_online())
+		return true;
+
+	return memcg_slab_post_charge(objp, gfpflags);
+}
+EXPORT_SYMBOL(kmem_cache_charge);
+
 /**
  * kmem_cache_alloc_node - Allocate an object on the specified node
  * @s: The cache to allocate from.
@@ -4481,7 +4577,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 	memcg_slab_free_hook(s, slab, &object, 1);
 	alloc_tagging_slab_free_hook(s, slab, &object, 1);
 
-	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 		do_slab_free(s, slab, object, object, 1, addr);
 }
 
@@ -4490,7 +4586,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 static noinline
 void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
 {
-	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 		do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
 }
 #endif
@@ -4509,6 +4605,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
 		do_slab_free(s, slab, head, tail, cnt, addr);
 }
 
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
+{
+	struct rcu_delayed_free *delayed_free =
+			container_of(rcu_head, struct rcu_delayed_free, head);
+	void *object = delayed_free->object;
+	struct slab *slab = virt_to_slab(object);
+	struct kmem_cache *s;
+
+	kfree(delayed_free);
+
+	if (WARN_ON(is_kfence_address(object)))
+		return;
+
+	/* find the object and the cache again */
+	if (WARN_ON(!slab))
+		return;
+	s = slab->slab_cache;
+	if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
+		return;
+
+	/* resume freeing */
+	if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
+		do_slab_free(s, slab, object, object, 1, _THIS_IP_);
+}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
 #ifdef CONFIG_KASAN_GENERIC
 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
 {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 64d07b842e73..e25381bf32d0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -714,6 +714,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
 out:
 	release_sock(sk);
 	if (newsk && mem_cgroup_sockets_enabled) {
+		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
 		int amt = 0;
 
 		/* atomically get the memory usage, set and charge the
@@ -731,8 +732,8 @@ out:
 		}
 
 		if (amt)
-			mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
-						GFP_KERNEL | __GFP_NOFAIL);
+			mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+		kmem_cache_charge(newsk, gfp);
 
 		release_sock(newsk);
 	}
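
For context on the SLAB_TYPESAFE_BY_RCU handling in the diff above, here is a minimal, hypothetical sketch of the usage pattern such caches permit (struct foo_obj and foo_lookup_get() are invented for this note, not part of the merge): an object freed with kmem_cache_free() may still be read by a concurrent RCU reader until the grace period ends, provided the reader revalidates it, for example via a reference count, before trusting it. CONFIG_SLUB_RCU_DEBUG approximates this by deferring the real free with call_rcu(), so KASAN can flag reads that happen only after the grace period, as the kmem_cache_rcu_uaf() test exercises.

    #include <linux/rcupdate.h>
    #include <linux/refcount.h>

    /* Hypothetical illustration of a SLAB_TYPESAFE_BY_RCU reader. */
    struct foo_obj {
    	refcount_t ref;
    	int key;
    };

    static struct foo_obj *foo_lookup_get(struct foo_obj *candidate, int key)
    {
    	struct foo_obj *obj = NULL;

    	rcu_read_lock();
    	/*
    	 * 'candidate' may have been freed (and even recycled for another
    	 * object of the same cache) since it was found; reading it here is
    	 * still safe because the cache is SLAB_TYPESAFE_BY_RCU.
    	 */
    	if (READ_ONCE(candidate->key) == key &&
    	    refcount_inc_not_zero(&candidate->ref))
    		obj = candidate;	/* revalidated; we now hold a reference */
    	rcu_read_unlock();

    	return obj;
    }

As the Kconfig help above notes, with SLUB_RCU_DEBUG enabled objects are generally not recycled within a grace period, so bugs in the revalidation step (the recycled-object case) are exercised less; that is why the option can be turned off even on KASAN builds.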