From 1470afefc3c42df5d1662f87d079b46651bdc95b Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 15 Mar 2023 17:31:02 -0700
Subject: cpumask: introduce for_each_cpu_or

Equivalent of for_each_cpu_and, except it ORs the two masks together
so it iterates all the CPUs present in either mask.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 lib/find_bit.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'lib')

diff --git a/lib/find_bit.c b/lib/find_bit.c
index c10920e66788..32f99e9a670e 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -182,6 +182,15 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l
 EXPORT_SYMBOL(_find_next_andnot_bit);
 #endif
 
+#ifndef find_next_or_bit
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+					unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_or_bit);
+#endif
+
 #ifndef find_next_zero_bit
 unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
 					 unsigned long start)
--
cgit v1.2.3-70-g09d2
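
The semantics of the new iterator can be modelled in ordinary user-space C:
visit every bit set in either of two bitmaps without first allocating an
OR'd copy. The sketch below is illustrative only (the helper and macro names
are invented and the masks are single 64-bit words); it is not the kernel's
FIND_NEXT_BIT-based implementation.

/*
 * Stand-alone model of "iterate the union of two bitmaps".
 * Not kernel code; names here are invented for the example.
 */
#include <stdio.h>

#define NBITS 64

/* Index of the next bit >= start set in (a | b), or NBITS if none. */
static unsigned int next_or_bit(unsigned long a, unsigned long b, unsigned int start)
{
	unsigned long word = a | b;

	for (unsigned int i = start; i < NBITS; i++)
		if (word & (1UL << i))
			return i;
	return NBITS;
}

#define for_each_bit_in_either(bit, a, b) \
	for ((bit) = next_or_bit((a), (b), 0); (bit) < NBITS; \
	     (bit) = next_or_bit((a), (b), (bit) + 1))

int main(void)
{
	unsigned long online = 0x0fUL;	/* bits 0-3: "online" CPUs */
	unsigned long dying  = 0x10UL;	/* bit 4: a "dying" CPU    */
	unsigned int bit;

	for_each_bit_in_either(bit, online, dying)
		printf("cpu %u\n", bit);	/* prints 0, 1, 2, 3, 4 */
	return 0;
}

In the kernel proper, for_each_cpu_or(cpu, mask1, mask2) plays the same role
over struct cpumask, backed by the _find_next_or_bit() helper added above.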

From 8b57b11cca88f397035a95b9e12b03511847b0e8 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 15 Mar 2023 17:31:02 -0700
Subject: pcpcntrs: fix dying cpu summation race

In commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all
interface") a race condition between a cpu dying and
percpu_counter_sum() iterating online CPUs was identified. The
solution was to iterate all possible CPUs for summation via
percpu_counter_sum_all().

We recently had a percpu_counter_sum() call in XFS trip over this same
race condition and it fired a debug assert because the filesystem was
unmounting and the counter *should* be zero just before we destroy it.
That was reported here:

https://lore.kernel.org/linux-kernel/20230314090649.326642-1-yebin@huaweicloud.com/

likely as a result of running generic/648 which exercises filesystems
in the presence of CPU online/offline events.

The solution of using percpu_counter_sum_all() is an awful one. We use
percpu counters and percpu_counter_sum() for accurate and reliable
threshold detection for space management, so a summation race
condition during these operations can result in overcommit of
available space and that may result in filesystem shutdowns.

As percpu_counter_sum_all() iterates all possible CPUs rather than
just those online or even those present, the mask can include CPUs
that aren't even installed in the machine or, in the case of machines
that can hot-plug CPU-capable nodes, CPUs that don't even have a
physical socket present in the machine.

Fundamentally, this race condition is caused by the CPU being offlined
being removed from the cpu_online_mask before the notifier that cleans
up per-cpu state is run. Hence percpu_counter_sum() will not sum the
count for a cpu currently being taken offline, regardless of whether
the notifier has run or not. This is the root cause of the bug.

The percpu counter notifier iterates all the registered counters,
locks the counter and moves the percpu count to the global sum. This
is serialised against other operations that move the percpu counter to
the global sum as well as percpu_counter_sum() operations that sum the
percpu counts while holding the counter lock.

Hence the notifier is safe to run concurrently with sum operations,
and the only thing we actually need to care about is that
percpu_counter_sum() iterates dying CPUs. That's trivial to do, and
when there are no CPUs dying, it has no additional overhead except for
a cpumask_or() operation.

This change makes percpu_counter_sum() always do the right thing in
the presence of CPU hot unplug events and makes
percpu_counter_sum_all() unnecessary. This, in turn, means that
filesystems like XFS, ext4, and btrfs don't have to work out when they
should use percpu_counter_sum() vs percpu_counter_sum_all() in their
space accounting algorithms.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 lib/percpu_counter.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index dba56c5c1837..0e096311e0c0 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -131,7 +131,7 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
 
 	raw_spin_lock_irqsave(&fbc->lock, flags);
 	ret = fbc->count;
-	for_each_cpu(cpu, cpu_mask) {
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
@@ -141,11 +141,20 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
 
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
- * but much slower version of percpu_counter_read_positive()
+ * but much slower version of percpu_counter_read_positive().
+ *
+ * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
+ * from CPUs that are in the process of being taken offline. Dying cpus have
+ * been removed from the online mask, but may not have had the hotplug dead
+ * notifier called to fold the percpu count back into the global counter sum.
+ * By including dying CPUs in the iteration mask, we avoid this race condition
+ * so __percpu_counter_sum() just does the right thing when CPUs are being taken
+ * offline.
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum_mask(fbc, cpu_online_mask);
+
+	return __percpu_counter_sum_mask(fbc, cpu_dying_mask);
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
--
cgit v1.2.3-70-g09d2
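
The race described above is easiest to see with a simplified model: a
"dying" CPU has already been cleared from the online mask, but its per-cpu
delta has not yet been folded into the global count by the hotplug dead
notifier, so an online-only sum transiently under-counts. The user-space
sketch below is illustrative only — the types, mask widths and values are
invented — and is not the kernel's percpu_counter implementation.

/* Simplified model of the dying-cpu summation race. Not kernel code. */
#include <stdio.h>

#define NR_CPUS 4

struct counter {
	long count;		/* global part (think fbc->count)       */
	long pcpu[NR_CPUS];	/* per-cpu deltas (think fbc->counters) */
};

static long counter_sum(const struct counter *c, unsigned int online,
			unsigned int dying)
{
	unsigned int mask = online | dying;	/* the fix: include dying CPUs */
	long ret = c->count;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1U << cpu))
			ret += c->pcpu[cpu];
	return ret;
}

int main(void)
{
	struct counter c = { .count = 0, .pcpu = { 5, -2, 0, 7 } };
	unsigned int online = 0x7;	/* CPUs 0-2 online */
	unsigned int dying = 0x8;	/* CPU 3: offline pending, delta not yet folded */

	/* Summing only online CPUs misses CPU 3's +7 delta. */
	printf("online-only sum:  %ld\n", counter_sum(&c, online, 0));	/* 3  */
	printf("online|dying sum: %ld\n", counter_sum(&c, online, dying));	/* 10 */
	return 0;
}

Summing over (online | dying) is exactly what the patch makes
__percpu_counter_sum() do via for_each_cpu_or().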

From e9b60c7f97130795c7aa81a649ae4b93a172a277 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Wed, 15 Mar 2023 17:31:03 -0700
Subject: pcpcntr: remove percpu_counter_sum_all()

percpu_counter_sum_all() is now redundant as the race condition it was
invented to handle is now dealt with by percpu_counter_sum() directly
and all users of percpu_counter_sum_all() have been removed. Remove
it.

This effectively reverts the changes made in f689054aace2
("percpu_counter: add percpu_counter_sum_all interface") except for
the cpumask iteration that fixes percpu_counter_sum() made earlier in
this series.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 include/linux/percpu_counter.h |  6 ------
 lib/percpu_counter.c           | 40 +++++++++++-----------------------------
 2 files changed, 11 insertions(+), 35 deletions(-)

(limited to 'lib')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 521a733e21a9..75b73c83bc9d 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -45,7 +45,6 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
-s64 percpu_counter_sum_all(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
@@ -196,11 +195,6 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 	return percpu_counter_read(fbc);
 }
 
-static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc)
-{
-	return percpu_counter_read(fbc);
-}
-
 static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 {
 	return true;
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 0e096311e0c0..5004463c4f9f 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -122,23 +122,6 @@ void percpu_counter_sync(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(percpu_counter_sync);
 
-static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
-			      const struct cpumask *cpu_mask)
-{
-	s64 ret;
-	int cpu;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&fbc->lock, flags);
-	ret = fbc->count;
-	for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) {
-		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
-		ret += *pcount;
-	}
-	raw_spin_unlock_irqrestore(&fbc->lock, flags);
-	return ret;
-}
-
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive().
@@ -153,22 +136,21 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
+	s64 ret;
+	int cpu;
+	unsigned long flags;
 
-	return __percpu_counter_sum_mask(fbc, cpu_dying_mask);
+	raw_spin_lock_irqsave(&fbc->lock, flags);
+	ret = fbc->count;
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
+		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
+	return ret;
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-/*
- * This is slower version of percpu_counter_sum as it traverses all possible
- * cpus. Use this only in the cases where accurate data is needed in the
- * presense of CPUs getting offlined.
- */
-s64 percpu_counter_sum_all(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum_mask(fbc, cpu_possible_mask);
-}
-EXPORT_SYMBOL(percpu_counter_sum_all);
-
 int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
 			  struct lock_class_key *key)
 {
--
cgit v1.2.3-70-g09d2
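
For callers, the end state of the series is that a single accurate
summation interface remains. The kernel-context sketch below shows the
caller-side pattern; the counter name and values are illustrative
assumptions, not code from any particular filesystem.

/* Hedged caller-side sketch (kernel context), not from a real driver. */
#include <linux/percpu_counter.h>
#include <linux/gfp.h>

static struct percpu_counter free_blocks;

static int example_setup(void)
{
	/* Second argument is the initial count. */
	return percpu_counter_init(&free_blocks, 0, GFP_KERNEL);
}

static void example_use(void)
{
	s64 accurate;

	percpu_counter_add(&free_blocks, 128);	/* fast, per-cpu batched */
	percpu_counter_sub(&free_blocks, 16);

	/*
	 * Accurate (slow) sum. With this series it also covers CPUs that
	 * are in the middle of going offline, so no _sum_all() variant is
	 * needed for threshold checks any more.
	 */
	accurate = percpu_counter_sum(&free_blocks);
	(void)accurate;
}

static void example_teardown(void)
{
	percpu_counter_destroy(&free_blocks);
}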

From 0fa99fdfe1b38da396d0b2d1496a823bcd0ebea0 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett"
Date: Tue, 7 Mar 2023 13:02:46 -0500
Subject: maple_tree: fix mas_skip_node() end slot detection

Patch series "Fix mas_skip_node() for mas_empty_area()", v2.

mas_empty_area() was incorrectly returning an error when there was
room. The issue was tracked down to mas_skip_node() using the
incorrect end-of-slot count. Instead of using the node's hard limit,
the limit of data should be used.

mas_skip_node() was also setting the min and max to that of the child
node, which was unnecessary. While these limits were being set, there
was also a bug that corrupted the maple state's max if the offset was
set to the maximum node pivot. The bug was without consequence unless
there was a sufficient gap in the next child node, which would then
cause an error to be returned.

This patch set fixes these errors by removing the limit setting from
mas_skip_node(), using mas_data_end() for the slot limits, and adding
tests for all failures discovered.

This patch (of 2):

mas_skip_node() is used to move the maple state to the node with a
higher limit. It does this by walking up the tree and increasing the
slot count. Since the slot count may not be able to be increased, it
may need to walk up multiple times to find room to walk right to a
higher limit node. The limit of slots that was being used was the node
limit and not the last location of data in the node. This would cause
the maple state to be shifted outside actual data and enter an error
state, thus returning -EBUSY.

As a result of the incorrect error state, mas_awalk() would return an
error instead of finding the allocation space.

The fix is to use mas_data_end() in mas_skip_node() to detect the
node's data end point and continue walking the tree up until it is
safe to move to a node with a higher limit.

The walk up the tree also sets the maple state limits, so remove the
buggy code from mas_skip_node(). Setting the limits had the
unfortunate side effect of triggering another bug if the parent node
was full and there was no suitable gap in the second-last child, but
room in the next child.

mas_skip_node() may also be passed a maple state in an error state
from mas_anode_descend() when no allocations are available. Return
immediately on such an error state.

Link: https://lkml.kernel.org/r/20230307180247.2220303-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20230307180247.2220303-2-Liam.Howlett@oracle.com
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett
Reported-by: Snild Dolkow
Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.com/
Tested-by: Snild Dolkow
Cc: Peng Zhang
Cc:
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

(limited to 'lib')

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 646297cae5d1..9e2735cbc2b4 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5099,35 +5099,21 @@ static inline bool mas_rewind_node(struct ma_state *mas)
  */
 static inline bool mas_skip_node(struct ma_state *mas)
 {
-	unsigned char slot, slot_count;
-	unsigned long *pivots;
-	enum maple_type mt;
+	if (mas_is_err(mas))
+		return false;
 
-	mt = mte_node_type(mas->node);
-	slot_count = mt_slots[mt] - 1;
 	do {
 		if (mte_is_root(mas->node)) {
-			slot = mas->offset;
-			if (slot > slot_count) {
+			if (mas->offset >= mas_data_end(mas)) {
 				mas_set_err(mas, -EBUSY);
 				return false;
 			}
 		} else {
 			mas_ascend(mas);
-			slot = mas->offset;
-			mt = mte_node_type(mas->node);
-			slot_count = mt_slots[mt] - 1;
 		}
-	} while (slot > slot_count);
-
-	mas->offset = ++slot;
-	pivots = ma_pivots(mas_mn(mas), mt);
-	if (slot > 0)
-		mas->min = pivots[slot - 1] + 1;
-
-	if (slot <= slot_count)
-		mas->max = pivots[slot];
+	} while (mas->offset >= mas_data_end(mas));
 
+	mas->offset++;
 	return true;
 }
 
--
cgit v1.2.3-70-g09d2
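
The regression fixed here is visible at the mas_empty_area() call site: a
search for a free gap could return -EBUSY even though a suitable gap
existed. The kernel-context sketch below shows such a caller; the tree name,
flags, range and size values are illustrative assumptions, not code taken
from the kernel.

/* Hedged sketch (kernel context) of a gap search with mas_empty_area(). */
#include <linux/maple_tree.h>

static struct maple_tree example_tree =
	MTREE_INIT(example_tree, MT_FLAGS_ALLOC_RANGE);

static int example_find_gap(unsigned long min, unsigned long max,
			    unsigned long size, unsigned long *start)
{
	int ret;

	MA_STATE(mas, &example_tree, 0, 0);

	mas_lock(&mas);
	ret = mas_empty_area(&mas, min, max, size);	/* 0 on success */
	if (!ret)
		*start = mas.index;	/* first index of the free gap */
	mas_unlock(&mas);

	return ret;	/* -EBUSY when no gap; before the fix, sometimes spuriously */
}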

From 4bd6dded6318dc8e2514d74868c1f8fb38b61a60 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett"
Date: Tue, 7 Mar 2023 13:02:47 -0500
Subject: test_maple_tree: add more testing for mas_empty_area()

Test robust filling of an entire area of the tree, then test one
beyond. This tests walking back up the tree at the end of nodes, as
well as the error condition. Test inspired by the reproducer code
provided by Snild Dolkow.

The last test in the function tests for the case of a corrupted maple
state caused by the incorrect limits set during mas_skip_node(). There
needs to be a gap in the second-last child and the last child, but the
search must rule out the second-last child's gap. This leaves the
maple state's max uncorrected, so an error is returned.

Link: https://lkml.kernel.org/r/20230307180247.2220303-3-Liam.Howlett@oracle.com
Cc: Snild Dolkow
Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.com/
Fixes: e15e06a83923 ("lib/test_maple_tree: add testing for maple tree")
Signed-off-by: Liam R. Howlett
Cc: Peng Zhang
Cc:
Signed-off-by: Andrew Morton
---
 lib/test_maple_tree.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'lib')

diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 3d19b1f78d71..f1db333270e9 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -2670,6 +2670,49 @@ static noinline void check_empty_area_window(struct maple_tree *mt)
 	rcu_read_unlock();
 }
 
+static noinline void check_empty_area_fill(struct maple_tree *mt)
+{
+	const unsigned long max = 0x25D78000;
+	unsigned long size;
+	int loop, shift;
+	MA_STATE(mas, mt, 0, 0);
+
+	mt_set_non_kernel(99999);
+	for (shift = 12; shift <= 16; shift++) {
+		loop = 5000;
+		size = 1 << shift;
+		while (loop--) {
+			mas_set(&mas, 0);
+			mas_lock(&mas);
+			MT_BUG_ON(mt, mas_empty_area(&mas, 0, max, size) != 0);
+			MT_BUG_ON(mt, mas.last != mas.index + size - 1);
+			mas_store_gfp(&mas, (void *)size, GFP_KERNEL);
+			mas_unlock(&mas);
+			mas_reset(&mas);
+		}
+	}
+
+	/* No space left. */
+	size = 0x1000;
+	rcu_read_lock();
+	MT_BUG_ON(mt, mas_empty_area(&mas, 0, max, size) != -EBUSY);
+	rcu_read_unlock();
+
+	/* Fill a depth 3 node to the maximum */
+	for (unsigned long i = 629440511; i <= 629440800; i += 6)
+		mtree_store_range(mt, i, i + 5, (void *)i, GFP_KERNEL);
+	/* Make space in the second-last depth 4 node */
+	mtree_erase(mt, 631668735);
+	/* Make space in the last depth 4 node */
+	mtree_erase(mt, 629506047);
+	mas_reset(&mas);
+	/* Search from just after the gap in the second-last depth 4 */
+	rcu_read_lock();
+	MT_BUG_ON(mt, mas_empty_area(&mas, 629506048, 690000000, 0x5000) != 0);
+	rcu_read_unlock();
+	mt_set_non_kernel(0);
+}
+
 static DEFINE_MTREE(tree);
 static int maple_tree_seed(void)
 {
@@ -2926,6 +2969,11 @@ static int maple_tree_seed(void)
 	check_empty_area_window(&tree);
 	mtree_destroy(&tree);
 
+	mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+	check_empty_area_fill(&tree);
+	mtree_destroy(&tree);
+
+
 #if defined(BENCH)
 skip:
 #endif
--
cgit v1.2.3-70-g09d2

From 13684e966d46283e0e89b6a4941596dc52b18bf3 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Wed, 15 Mar 2023 15:28:17 +0100
Subject: lib: dhry: fix unstable smp_processor_id(_) usage

When running the in-kernel Dhrystone benchmark with
CONFIG_DEBUG_PREEMPT=y:

    BUG: using smp_processor_id() in preemptible [00000000] code: bash/938

Fix this by not using smp_processor_id() directly, but instead
wrapping the whole benchmark inside a get_cpu()/put_cpu() pair. This
makes sure the whole benchmark is run on the same CPU core, and the
reported values are consistent.

Link: https://lkml.kernel.org/r/b0d29932bb24ad82cea7f821e295c898e9657be0.1678890070.git.geert+renesas@glider.be
Fixes: d5528cc16893f1f6 ("lib: add Dhrystone benchmark test")
Signed-off-by: Geert Uytterhoeven
Reported-by: Tobias Klausmann
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217179
Signed-off-by: Andrew Morton
---
 lib/dhry_run.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/dhry_run.c b/lib/dhry_run.c
index f9d33efa6d09..f15ac666e9d3 100644
--- a/lib/dhry_run.c
+++ b/lib/dhry_run.c
@@ -31,6 +31,7 @@ MODULE_PARM_DESC(iterations,
 
 static void dhry_benchmark(void)
 {
+	unsigned int cpu = get_cpu();
 	int i, n;
 
 	if (iterations > 0) {
@@ -45,9 +46,10 @@ static void dhry_benchmark(void)
 	}
 
 report:
+	put_cpu();
 	if (n >= 0)
-		pr_info("CPU%u: Dhrystones per Second: %d (%d DMIPS)\n",
-			smp_processor_id(), n, n / DHRY_VAX);
+		pr_info("CPU%u: Dhrystones per Second: %d (%d DMIPS)\n", cpu,
+			n, n / DHRY_VAX);
 	else if (n == -EAGAIN)
 		pr_err("Please increase the number of iterations\n");
 	else
--
cgit v1.2.3-70-g09d2
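
The pattern the fix relies on — pin the caller to one CPU for the duration
of a measurement, then report the CPU id it actually ran on — looks like
the kernel-context sketch below. The function name and the stand-in work
are illustrative, not taken from lib/dhry_run.c.

/* Hedged sketch (kernel context) of the get_cpu()/put_cpu() pattern. */
#include <linux/smp.h>
#include <linux/printk.h>

static int example_measure(void)
{
	unsigned int cpu = get_cpu();	/* disables preemption, returns CPU id */
	int result;

	/*
	 * Everything between get_cpu() and put_cpu() stays on @cpu and cannot
	 * be migrated; smp_processor_id() would also be safe to call here.
	 */
	result = 42;			/* stand-in for the benchmark loop */

	put_cpu();			/* re-enables preemption */

	pr_info("CPU%u: result %d\n", cpu, result);
	return result;
}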