From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use the dynamic percpu allocator. The first chunk is allocated using the embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as far as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around the addressing range limit that the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its inverse - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tested on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. 
Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/x86/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f9..a48a90076d83 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP -- cgit v1.2.3-70-g09d2 From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. 
Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f2..f7ac27215512 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f3..e26788e0da4a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline 
ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac7934904..7971997de310 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ out_free_ar: free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: 
the size of static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. -- cgit v1.2.3-70-g09d2 From 4518e6a0c038b98be4c480e6f4481e8676bd15dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). 
Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Andi Kleen --- arch/x86/Kconfig | 4 -- arch/x86/kernel/setup_percpu.c | 155 ++++++++--------------------------------- 2 files changed, 28 insertions(+), 131 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f7ac27215512..869d7d301448 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config NEED_PER_CPU_LPAGE_FIRST_CHUNK - def_bool y - depends on NEED_MULTIPLE_NODES - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 67f6314de9f1..d559af913e1f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -/* - * Large page remapping allocator - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static void __init pcpul_map(void *ptr, size_t size, void *addr) -{ - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)addr); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); -} - -static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { +#ifdef CONFIG_NEED_MULTIPLE_NODES if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; -} - -static int __init setup_pcpu_lpage(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + 
PERCPU_DYNAMIC_RESERVE; - size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - struct pcpu_alloc_info *ai; - int rc; - - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* allocate and build unit_map */ - ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpu_lpage_cpu_distance); - if (IS_ERR(ai)) { - pr_warning("PERCPU: failed to build unit_map (%ld)\n", - PTR_ERR(ai)); - return PTR_ERR(ai); - } - - /* do the parameters look okay? */ - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = 0; - int group; - - for (group = 0; group < ai->nr_groups; group++) - tot_size += ai->unit_size * ai->groups[group].nr_units; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - rc = -EINVAL; - goto out_free; - } - } - - rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); -out_free: - pcpu_free_alloc_info(ai); - return rc; -} #else -static int __init setup_pcpu_lpage(bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static int __init setup_pcpu_embed(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. 
- */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PAGE_SIZE, NULL, pcpu_fc_alloc, - pcpu_fc_free); } -/* - * Page allocator - * - * Boring fallback 4k page allocator. This allocator puts more - * pressure on PTE TLBs but other than that behaves nicely on both UMA - * and NUMA. - */ static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static int __init setup_pcpu_page(void) -{ - return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif rc = -EINVAL; - if (pcpu_chosen_fc != PCPU_FC_AUTO) { - if (pcpu_chosen_fc != PCPU_FC_PAGE) { - if (pcpu_chosen_fc == PCPU_FC_LPAGE) - rc = setup_pcpu_lpage(true); - else - rc = setup_pcpu_embed(true); - - if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); - } - } else { - rc = setup_pcpu_lpage(false); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? 
PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - rc = setup_pcpu_embed(false); + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) - rc = setup_pcpu_page(); + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); -- cgit v1.2.3-70-g09d2