diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/rcutorture.c | 19 | ||||
-rw-r--r-- | kernel/rcu/srcu.c | 143 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 35 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 13 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 28 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 5 | ||||
-rw-r--r-- | kernel/rcu/tree_trace.c | 3 | ||||
-rw-r--r-- | kernel/rcu/update.c | 6 |
9 files changed, 130 insertions, 124 deletions
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..d81345be730e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,10 +564,25 @@ static void srcu_torture_stats(void) pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..e773129c8b08 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. - * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. - */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,12 +315,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) { - sp->completed++; + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ } /* @@ -392,6 +367,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -425,6 +401,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -444,8 +421,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -613,7 +593,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b23a4d076f3d..fa6a48d3917b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -41,8 +41,6 @@ /* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_ctrlblk *rcp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6e4b1dcebeb2..d80e0d2f68c6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -717,7 +717,7 @@ static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; + rdp->nxttail[RCU_NEXT_TAIL] != NULL; } /* @@ -1317,11 +1317,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } - /* And if it has been a really long time, kick the CPU as well. */ - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + /* + * If more than halfway to RCU CPU stall-warning time, do + * a resched_cpu() to try to loosen things up a bit. + */ + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + resched_cpu(rdp->cpu); return 0; } @@ -1374,7 +1375,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1384,11 +1388,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1476,6 +1479,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(rsp); } else { if (READ_ONCE(rsp->gpnum) != gpnum || READ_ONCE(rsp->completed) == gpnum) { @@ -1492,9 +1498,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } } - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); - rcu_check_gp_kthread_starvation(rsp); panic_on_rcu_stall(); @@ -3857,7 +3860,6 @@ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3879,7 +3881,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) * of the next grace period. */ rnp = rdp->mynode; - mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); @@ -3963,7 +3964,7 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); + rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3b953dcf6afc..b60f2b6caa14 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -521,7 +521,6 @@ struct rcu_state { struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ @@ -690,18 +689,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - -/* * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e155a465cf84..a7b639ccd46e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,16 +20,26 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ -/* Wrapper functions for expedited grace periods. */ +/* + * Record the start of an expedited grace period. + */ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) { rcu_seq_start(&rsp->expedited_sequence); } + +/* + * Record the end of an expedited grace period. + */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } + +/* + * Take a snapshot of the expedited-grace-period counter. + */ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { unsigned long s; @@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); return s; } + +/* + * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true + * if a full expedited grace period has elapsed since that snapshot + * was taken. + */ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { return rcu_seq_done(&rsp->expedited_sequence, s); @@ -621,6 +637,11 @@ void synchronize_sched_expedited(void) { struct rcu_state *rsp = &rcu_sched_state; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched_expedited() in RCU read-side critical section"); + /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; @@ -690,6 +711,11 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 652209589adf..a240f3308be6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2366,8 +2366,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) } /* - * Each pass through this loop sets up one rcu_data structure and - * spawns one rcu_nocb_kthread(). + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b833cd0a29e8..8751a748499a 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -194,9 +194,8 @@ static int show_rcuexp(struct seq_file *m, void *v) s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s0, s1, s2, s3, - atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4f6db7e6a117..9e03db9ea9c0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); */ void rcu_end_inkernel_boot(void) { - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); } |