From d629d819579327267884a12de21ef6d4b539db88 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 23 Apr 2008 22:31:08 +0300 Subject: slub: improve kmem_cache_destroy() error message As pointed out by Ingo, the SLUB warning of calling kmem_cache_destroy() with cache that still has objects triggers in practice. So turn this WARN_ON() into a nice SLUB specific error message to avoid people confusing it to a SLUB bug. Cc: Ingo Molnar Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 39592b5ce68a..378d3f1b548f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2426,8 +2426,11 @@ void kmem_cache_destroy(struct kmem_cache *s) if (!s->refcount) { list_del(&s->list); up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } sysfs_slab_remove(s); } else up_write(&slub_lock); -- cgit v1.2.3-70-g09d2 From 599870b175987008b5f5c82a70b89f751e12822e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Apr 2008 12:36:52 -0700 Subject: slub: free_list() cleanup free_list looked a bit screwy so here is an attempt to clean it up. free_list is is only used for freeing partial lists. We do not need to return a parameter if we decrement nr_partial within the function which allows a simplification of the whole thing. The current version modifies nr_partial outside of the list_lock which is technically not correct. It was only ok because we should be the only user of this slab cache at this point. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 378d3f1b548f..c937233127e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2372,25 +2372,21 @@ const char *kmem_cache_name(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_name); /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; unsigned long flags; struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); - } else - slabs_inuse++; + n->nr_partial--; + } spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; } /* @@ -2407,8 +2403,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (slabs_node(s, node)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); -- cgit v1.2.3-70-g09d2 From 33b12c38134e95e5afa73214af6f49abd7b8418e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Apr 2008 12:22:43 -0700 Subject: slub: Dump list of objects not freed on kmem_cache_close() Dump a list of unfreed objects if a slab cache is closed but objects still remain. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index c937233127e2..64c2b2bfbd79 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2371,6 +2371,32 @@ const char *kmem_cache_name(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_name); +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + /* * Attempt to free all partial slabs on a node. */ @@ -2380,12 +2406,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, &n->partial, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); } + } spin_unlock_irqrestore(&n->list_lock, flags); } -- cgit v1.2.3-70-g09d2 From 39b264641a0c3b5e0e742e2046b49e92d1f3be88 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:30 +0300 Subject: slub: Store max number of objects in the page struct. Split the inuse field up to be able to store the number of objects in this page in the page struct as well. Necessary if we want to have pages of various orders for a slab. Also avoids touching struct kmem_cache cachelines in __slab_alloc(). Update diagnostic code to check the number of objects and make sure that the number of objects always stays within the bounds of a 16 bit unsigned integer. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 5 ++++- mm/slub.c | 54 ++++++++++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 21 deletions(-) (limited to 'mm/slub.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index af190ceab971..e0bd2235296b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -42,7 +42,10 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - unsigned int inuse; /* SLUB: Nr of objects */ + struct { /* SLUB */ + u16 inuse; + u16 objects; + }; }; union { struct { diff --git a/mm/slub.c b/mm/slub.c index 64c2b2bfbd79..6641025c597f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -301,7 +301,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -451,8 +451,8 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -652,6 +652,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -664,20 +665,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << s->order); + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = check_bytes(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, start, end); return 0; @@ -739,15 +740,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -765,7 +775,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) void *fp = page->freelist; void *object = NULL; - while (fp && nr <= s->objects) { + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -777,7 +787,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -788,10 +798,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; @@ -910,7 +920,7 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; @@ -1081,6 +1091,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; + page->objects = s->objects; mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1519,7 +1530,7 @@ load_freelist: goto debug; c->freelist = object[c->offset]; - c->page->inuse = s->objects; + c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: @@ -1818,6 +1829,9 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; + if ((PAGE_SIZE << min_order) / size > 65535) + return get_order(size * 65535) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -3251,7 +3265,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); @@ -3528,10 +3542,10 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); + DECLARE_BITMAP(map, page->objects); void *p; - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); -- cgit v1.2.3-70-g09d2 From 224a88be40c45c0da5bdc45a8118004a37c60e8a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:31 +0300 Subject: slub: for_each_object must be passed the number of objects in a slab Pass the number of objects to the for_each_object macro. Most of these are debug related. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 6641025c597f..67f7d6068934 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -327,8 +327,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) /* Scan freelist */ @@ -774,6 +774,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp = page->freelist; void *object = NULL; + unsigned long max_objects; while (fp && nr <= page->objects) { if (fp == search) @@ -798,6 +799,16 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > 65535) + max_objects = 65535; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " "counted were %d", page->inuse, page->objects - nr); @@ -1135,7 +1146,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) memset(start, POISON_INUSE, PAGE_SIZE << s->order); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1157,7 +1168,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) + for_each_object(p, s, page_address(page), + page->objects) check_object(s, page, p, 0); ClearSlabDebug(page); } @@ -3273,7 +3285,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) if (!check_object(s, page, p, 1)) return 0; @@ -3549,7 +3561,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } -- cgit v1.2.3-70-g09d2 From 834f3d119234b35a1985a2449831d99356637937 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:31 +0300 Subject: slub: Add kmem_cache_order_objects struct Pack the order and the number of objects into a single word. This saves some memory in the kmem_cache_structure and more importantly allows us to fetch both values atomically. Later the slab orders become runtime configurable and we need to fetch these two items together in order to properly allocate a slab and initialize its objects. Fix the race by fetching the order and the number of objects in one word. [penberg@cs.helsinki.fi: fix memset() page order in new_slab()] Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 12 ++++++-- mm/slub.c | 76 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 61 insertions(+), 27 deletions(-) (limited to 'mm/slub.c') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 79d59c937fac..4131e5fbd18b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -52,6 +52,15 @@ struct kmem_cache_node { #endif }; +/* + * Word size structure that can be atomically updated or read and that + * contains both the order and the number of objects that a slab of the + * given order would contain. + */ +struct kmem_cache_order_objects { + unsigned long x; +}; + /* * Slab cache management. */ @@ -61,7 +70,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ - int order; /* Current preferred allocation order */ + struct kmem_cache_order_objects oo; /* * Avoid an extra cache line for UP, SMP and for the node local to @@ -70,7 +79,6 @@ struct kmem_cache { struct kmem_cache_node local_node; /* Allocation and freeing of slabs */ - int objects; /* Number of objects in slab */ gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(struct kmem_cache *, void *); diff --git a/mm/slub.c b/mm/slub.c index 67f7d6068934..0a220df5ed7c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -341,6 +341,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << 16) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> 16; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & ((1 << 16) - 1); +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -665,7 +685,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << s->order); + length = (PAGE_SIZE << compound_order(page)); end = start + length; remainder = length % s->size; if (!remainder) @@ -1090,19 +1110,21 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node) {} static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; + int order = oo_order(oo); + int pages = 1 << order; flags |= s->allocflags; if (node == -1) - page = alloc_pages(flags, s->order); + page = alloc_pages(flags, order); else - page = alloc_pages_node(node, flags, s->order); + page = alloc_pages_node(node, flags, order); if (!page) return NULL; - page->objects = s->objects; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -1143,7 +1165,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); last = start; for_each_object(p, s, start, page->objects) { @@ -1162,7 +1184,8 @@ out: static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; if (unlikely(SlabDebug(page))) { void *p; @@ -1181,7 +1204,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); - __free_pages(page, s->order); + __free_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -2202,6 +2225,7 @@ static int calculate_sizes(struct kmem_cache *s) unsigned long flags = s->flags; unsigned long size = s->objsize; unsigned long align = s->align; + int order; /* * Round up object size to the next word boundary. We can only @@ -2294,17 +2318,17 @@ static int calculate_sizes(struct kmem_cache *s) * page allocator order 0 allocs so take a reasonably large * order that will allows us a good number of objects. */ - s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); + order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); s->flags |= __PAGE_ALLOC_FALLBACK; s->allocflags |= __GFP_NOWARN; } else - s->order = calculate_order(size); + order = calculate_order(size); - if (s->order < 0) + if (order < 0) return 0; s->allocflags = 0; - if (s->order) + if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) @@ -2316,9 +2340,9 @@ static int calculate_sizes(struct kmem_cache *s) /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; + s->oo = oo_make(order, size); - return !!s->objects; + return !!oo_objects(s->oo); } @@ -2351,7 +2375,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, + s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); return 0; } @@ -2789,8 +2813,9 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->oo); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2803,7 +2828,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2835,7 +2860,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3351,7 +3376,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->oo)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3719,7 +3744,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, - n->nr_partial; if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + x = full_slabs * oo_objects(s->oo); else x = full_slabs; total += x; @@ -3798,13 +3823,13 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); } SLAB_ATTR_RO(order); @@ -4451,11 +4476,12 @@ static int s_show(struct seq_file *m, void *p) nr_inuse += count_partial(n); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; + nr_objs = nr_slabs * oo_objects(s->oo); + nr_inuse += (nr_slabs - nr_partials) * oo_objects(s->oo); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, s->objects, (1 << s->order)); + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL); -- cgit v1.2.3-70-g09d2 From 205ab99dd103e3dd5b0964dad8a16dfe2db69b2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:40 +0300 Subject: slub: Update statistics handling for variable order slabs Change the statistics to consider that slabs of the same slabcache can have different number of objects in them since they may be of different order. Provide a new sysfs field total_objects which shows the total objects that the allocated slabs of a slabcache could hold. Add a max field that holds the largest slab order that was ever used for a slab cache. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- Documentation/vm/slabinfo.c | 27 ++++---- include/linux/slub_def.h | 2 + mm/slub.c | 150 ++++++++++++++++++++++++++++---------------- 3 files changed, 110 insertions(+), 69 deletions(-) (limited to 'mm/slub.c') diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index 22d7e3e4d60c..d3ce295bffac 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -31,7 +31,7 @@ struct slabinfo { int hwcache_align, object_size, objs_per_slab; int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; - unsigned long partial, objects, slabs; + unsigned long partial, objects, slabs, objects_partial, objects_total; unsigned long alloc_fastpath, alloc_slowpath; unsigned long free_fastpath, free_slowpath; unsigned long free_frozen, free_add_partial, free_remove_partial; @@ -540,7 +540,8 @@ void slabcache(struct slabinfo *s) return; store_size(size_str, slab_size(s)); - snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); + snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, + s->partial, s->cpu_slabs); if (!line++) first_line(); @@ -776,7 +777,6 @@ void totals(void) unsigned long used; unsigned long long wasted; unsigned long long objwaste; - long long objects_in_partial_slabs; unsigned long percentage_partial_slabs; unsigned long percentage_partial_objs; @@ -790,18 +790,11 @@ void totals(void) wasted = size - used; objwaste = s->slab_size - s->object_size; - objects_in_partial_slabs = s->objects - - (s->slabs - s->partial - s ->cpu_slabs) * - s->objs_per_slab; - - if (objects_in_partial_slabs < 0) - objects_in_partial_slabs = 0; - percentage_partial_slabs = s->partial * 100 / s->slabs; if (percentage_partial_slabs > 100) percentage_partial_slabs = 100; - percentage_partial_objs = objects_in_partial_slabs * 100 + percentage_partial_objs = s->objects_partial * 100 / s->objects; if (percentage_partial_objs > 100) @@ -823,8 +816,8 @@ void totals(void) min_objects = s->objects; if (used < min_used) min_used = used; - if (objects_in_partial_slabs < min_partobj) - min_partobj = objects_in_partial_slabs; + if (s->objects_partial < min_partobj) + min_partobj = s->objects_partial; if (percentage_partial_slabs < min_ppart) min_ppart = percentage_partial_slabs; if (percentage_partial_objs < min_ppartobj) @@ -848,8 +841,8 @@ void totals(void) max_objects = s->objects; if (used > max_used) max_used = used; - if (objects_in_partial_slabs > max_partobj) - max_partobj = objects_in_partial_slabs; + if (s->objects_partial > max_partobj) + max_partobj = s->objects_partial; if (percentage_partial_slabs > max_ppart) max_ppart = percentage_partial_slabs; if (percentage_partial_objs > max_ppartobj) @@ -864,7 +857,7 @@ void totals(void) total_objects += s->objects; total_used += used; - total_partobj += objects_in_partial_slabs; + total_partobj += s->objects_partial; total_ppart += percentage_partial_slabs; total_ppartobj += percentage_partial_objs; @@ -1160,6 +1153,8 @@ void read_slab_dir(void) slab->hwcache_align = get_obj("hwcache_align"); slab->object_size = get_obj("object_size"); slab->objects = get_obj("objects"); + slab->objects_partial = get_obj("objects_partial"); + slab->objects_total = get_obj("objects_total"); slab->objs_per_slab = get_obj("objs_per_slab"); slab->order = get_obj("order"); slab->partial = get_obj("partial"); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 4131e5fbd18b..4236b5dee812 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -48,6 +48,7 @@ struct kmem_cache_node { struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; + atomic_long_t total_objects; struct list_head full; #endif }; @@ -79,6 +80,7 @@ struct kmem_cache { struct kmem_cache_node local_node; /* Allocation and freeing of slabs */ + struct kmem_cache_order_objects max; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(struct kmem_cache *, void *); diff --git a/mm/slub.c b/mm/slub.c index 0a220df5ed7c..c8514e93ffdf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -886,7 +886,7 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) return atomic_long_read(&n->nr_slabs); } -static inline void inc_slabs_node(struct kmem_cache *s, int node) +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); @@ -896,14 +896,17 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) + if (!NUMA_BUILD || n) { atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } } -static inline void dec_slabs_node(struct kmem_cache *s, int node) +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); } /* Object debug checks for alloc/free paths */ @@ -1101,9 +1104,12 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } -static inline void inc_slabs_node(struct kmem_cache *s, int node) {} -static inline void dec_slabs_node(struct kmem_cache *s, int node) {} +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} #endif + /* * Slab allocation and freeing */ @@ -1155,7 +1161,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - inc_slabs_node(s, page_to_nid(page)); + inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1230,7 +1236,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - dec_slabs_node(s, page_to_nid(page)); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } @@ -2144,7 +2150,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_tracking(kmalloc_caches, n); #endif init_kmem_cache_node(n); - inc_slabs_node(kmalloc_caches, node); + inc_slabs_node(kmalloc_caches, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2341,6 +2347,8 @@ static int calculate_sizes(struct kmem_cache *s) * Determine the number of objects per slab */ s->oo = oo_make(order, size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; return !!oo_objects(s->oo); @@ -2813,7 +2821,7 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = oo_objects(s->oo); + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; @@ -3276,7 +3284,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, } #if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO) -static unsigned long count_partial(struct kmem_cache_node *n) +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) { unsigned long flags; unsigned long x = 0; @@ -3284,10 +3293,25 @@ static unsigned long count_partial(struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - x += page->inuse; + x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; } + +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; +} + +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) @@ -3376,7 +3400,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->oo)) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3676,22 +3700,23 @@ static int list_locations(struct kmem_cache *s, char *buf, } enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; @@ -3702,56 +3727,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return -ENOMEM; per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; else x = 1; + total += x; - nodes[node] += x; + nodes[c->node] += x; } - per_cpu[node]++; + per_cpu[c->node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_OBJECTS) - x = full_slabs * oo_objects(s->oo); + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3852,7 +3881,7 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_ALL); } SLAB_ATTR_RO(slabs); @@ -3870,10 +3899,22 @@ SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); @@ -4131,6 +4172,8 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -4459,7 +4502,8 @@ static int s_show(struct seq_file *m, void *p) unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_inuse = 0; - unsigned long nr_objs; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; struct kmem_cache *s; int node; @@ -4473,11 +4517,11 @@ static int s_show(struct seq_file *m, void *p) nr_partials += n->nr_partial; nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * oo_objects(s->oo); - nr_inuse += (nr_slabs - nr_partials) * oo_objects(s->oo); + nr_inuse = nr_objs - nr_free; seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, nr_objs, s->size, oo_objects(s->oo), -- cgit v1.2.3-70-g09d2 From 65c3376aaca96c66aa76014aaf430398964b68cb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:40 +0300 Subject: slub: Fallback to minimal order during slab page allocation If any higher order allocation fails then fall back the smallest order necessary to contain at least one object. This enables fallback for all allocations to order 0 pages. The fallback will waste more memory (objects will not fit neatly) and the fallback slabs will be not as efficient as larger slabs since they contain less objects. Note that SLAB also depends on order 1 allocations for some slabs that waste too much memory if forced into PAGE_SIZE'd page. SLUB now can now deal with failing order 1 allocs which SLAB cannot do. Add a new field min that will contain the objects for the smallest possible order for a slab cache. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 ++ mm/slub.c | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 11 deletions(-) (limited to 'mm/slub.c') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 4236b5dee812..71e43a12ebbb 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -29,6 +29,7 @@ enum stat_item { DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + ORDER_FALLBACK, /* Number of times fallback was necessary */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { @@ -81,6 +82,7 @@ struct kmem_cache { /* Allocation and freeing of slabs */ struct kmem_cache_order_objects max; + struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(struct kmem_cache *, void *); diff --git a/mm/slub.c b/mm/slub.c index c8514e93ffdf..35c22d940ba7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1113,28 +1113,43 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; - int order = oo_order(oo); - int pages = 1 << order; flags |= s->allocflags; - if (node == -1) - page = alloc_pages(flags, order); - else - page = alloc_pages_node(node, flags, order); - - if (!page) - return NULL; + page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, + oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -2347,6 +2362,7 @@ static int calculate_sizes(struct kmem_cache *s) * Determine the number of objects per slab */ s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -4163,7 +4179,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); - +STAT_ATTR(ORDER_FALLBACK, order_fallback); #endif static struct attribute *slab_attrs[] = { @@ -4216,6 +4232,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, #endif NULL }; -- cgit v1.2.3-70-g09d2 From 319d1e240683d37924ea8977c91730c3393fd453 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Drop fallback to page allocator method There is now a generic method of falling back to a slab page of minimal order. No need anymore for the fallback to kmalloc_large(). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 35c22d940ba7..de6f38761d1f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -204,8 +204,6 @@ static inline void ClearSlabDebug(struct page *page) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ -#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ /* Not all arches define cache_line_size */ #ifndef cache_line_size @@ -1623,27 +1621,6 @@ new_slab: c->page = new; goto load_freelist; } - - /* - * No memory available. - * - * If the slab uses higher order allocs but the object is - * smaller than a page size then we can fallback in emergencies - * to the page allocator via kmalloc_large. The page allocator may - * have failed to obtain a higher order page and we can try to - * allocate a single page if the object fits into a single page. - * That is only possible if certain conditions are met that are being - * checked when a slab is created. - */ - if (!(gfpflags & __GFP_NORETRY) && - (s->flags & __PAGE_ALLOC_FALLBACK)) { - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - object = kmalloc_large(s->objsize, gfpflags); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - return object; - } return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -2330,20 +2307,7 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; - - if ((flags & __KMALLOC_CACHE) && - PAGE_SIZE / size < slub_min_objects) { - /* - * Kmalloc cache that would not have enough objects in - * an order 0 page. Kmalloc slabs can fallback to - * page allocator order 0 allocs so take a reasonably large - * order that will allows us a good number of objects. - */ - order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); - s->flags |= __PAGE_ALLOC_FALLBACK; - s->allocflags |= __GFP_NOWARN; - } else - order = calculate_order(size); + order = calculate_order(size); if (order < 0) return 0; @@ -2589,7 +2553,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, down_write(&slub_lock); if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags | __KMALLOC_CACHE, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); @@ -3105,9 +3069,6 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK)) - return 1; - if (s->ctor) return 1; -- cgit v1.2.3-70-g09d2 From 06b285dc3d6194abe79ab9dcaaab703d6f75627c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Make the order configurable for each slab cache Makes /sys/kernel/slab//order writable. The allocation order of a slab cache can then be changed dynamically during runtime. This can be used to override the objects per slabs value establisheed with the slub_min_objects setting that was manually specified or calculated on bootup. The changes of the slab order can occur while allocate_slab() runs. Allocate slab needs the order and the number of slab objects that are both changed by the change of order. Both are put into a single word (struct kmem_cache_order_objects). They can then be atomically updated and retrieved. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index de6f38761d1f..23a2683d6c9f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2218,7 +2218,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->objsize; @@ -2307,7 +2307,10 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; - order = calculate_order(size); + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); if (order < 0) return 0; @@ -2346,7 +2349,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; s->refcount = 1; @@ -3833,11 +3836,23 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int order = simple_strtoul(buf, NULL, 10); + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", oo_order(s->oo)); } -SLAB_ATTR_RO(order); +SLAB_ATTR(order); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { @@ -3971,7 +3986,7 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3990,7 +4005,7 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') s->flags |= SLAB_POISON; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -4009,7 +4024,7 @@ static ssize_t store_user_store(struct kmem_cache *s, s->flags &= ~SLAB_STORE_USER; if (buf[0] == '1') s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); -- cgit v1.2.3-70-g09d2 From 31d33baf36bda7a2fea800648d87c9fe6155e7ca Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Simplify any_slab_object checks Since we now have total_objects counter per node use that to check for the presence of any objects. The loop over all cpu slabs is not that useful since any cpu slab would require an object allocation first. So drop that. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 23a2683d6c9f..06533f342be0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3775,14 +3775,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3790,7 +3782,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_read(&n->total_objects)) return 1; } return 0; -- cgit v1.2.3-70-g09d2 From 114e9e89e668ec561c9b0f3dea7bcc8af7c29d21 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Drop DEFAULT_MAX_ORDER / DEFAULT_MIN_OBJECTS We can now fallback to order 0 slabs. So set the slub_max_order to PAGE_CACHE_ORDER_COSTLY but keep the slub_min_objects at 4. This will mostly preserve the orders used in 2.6.25. F.e. The 2k kmalloc slab will use order 1 allocs and the 4k kmalloc slab order 2. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 06533f342be0..6572cef0c43c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -149,25 +149,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif - /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -1821,8 +1802,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects = 4; /* * Merge control. If this is set then no merging of slab caches will occur. -- cgit v1.2.3-70-g09d2 From 9b2cd506e5f2117f94c28a0040bf5da058105316 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:11:41 +0300 Subject: slub: Calculate min_objects based on number of processors. The mininum objects per slab is calculated based on the number of processors that may come online. Processors min_objects --------------------------- 1 8 2 12 4 16 8 20 16 24 32 28 64 32 1024 48 4096 56 The higher the number of processors the large the order sizes used for various slab caches will become. This has been shown to address the performance issues in hackbench on 16p etc. The calculation is only performed if slub_min_objects is zero (default). If one specifies a slub_min_objects on boot then that setting is taken. As suggested by Zhang Yanmin's performance tests on 16-core Tigerton, use the formula '4 * (fls(nr_cpu_ids) + 1)': ./hackbench 100 process 2000: 1) 2.6.25-rc6slab: 23.5 seconds 2) 2.6.25-rc7SLUB+slub_min_objects=20: 31 seconds 3) 2.6.25-rc7SLUB+slub_min_objects=24: 23.5 seconds Signed-off-by: Christoph Lameter Signed-off-by: Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 6572cef0c43c..e2e6ba7a5172 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1803,7 +1803,7 @@ static struct page *get_object_page(const void *x) */ static int slub_min_order; static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects = 4; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1880,6 +1880,8 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); while (min_objects > 1) { fraction = 8; while (fraction >= 4) { -- cgit v1.2.3-70-g09d2 From c124f5b54f879e5870befcc076addbd5d614663f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 14 Apr 2008 19:13:29 +0300 Subject: slub: pack objects denser Since we now have more orders available use a denser packing. Increase slab order if more than 1/16th of a slab would be wasted. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index e2e6ba7a5172..d821ce6fff39 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1818,7 +1818,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1883,7 +1883,7 @@ static inline int calculate_order(int size) if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, slub_max_order, fraction); -- cgit v1.2.3-70-g09d2