From 2800486ee34825d954f64c6f98037daea328f121 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 7 Sep 2017 17:03:50 +0200
Subject: sched/fair: Avoid newidle balance for !active CPUs

On CPU hot unplug, when parking the last kthread we'll try and
schedule into idle to kill the CPU. This last schedule can (and does)
trigger newidle balance because at this point the sched domains are
still up because of commit:

  77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds")

Obviously pulling tasks to an already offline CPU is a bad idea, and
all balancing operations _should_ be subject to cpu_active_mask, make
it so.

Reported-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds")
Link: http://lkml.kernel.org/r/20170907150613.994135806@infradead.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8415d1ec2b84..3bcea40d3029 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8447,6 +8447,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
+	/*
+	 * Do not pull tasks towards !active CPUs...
+	 */
+	if (!cpu_active(this_cpu))
+		return 0;
+
 	/*
 	 * This is OK, because current is on_cpu, which avoids it being picked
 	 * for load-balance and preemption/IRQs are still disabled avoiding
--
cgit v1.2.3-70-g09d2

From edd8e41d2e3cbd6ebe13ead30eb1adc6f48cbb33 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 7 Sep 2017 17:03:51 +0200
Subject: sched/fair: Plug hole between hotplug and active_load_balance()

The load balancer applies cpu_active_mask to whatever sched_domains it
finds, however in the case of active_balance there is a hole between
setting rq->{active_balance,push_cpu} and running the stop_machine work
doing the actual migration.

The @push_cpu can go offline in this window, which would result in us
moving a task onto a dead cpu, which is a fairly bad thing.

Double check the active mask before the stop work does the migration.

	CPU0					CPU1

	stop_machine(takedown_cpu)
	load_balance()				cpu_stopper_thread()
	  ...					  work = multi_cpu_stop
	  stop_one_cpu_nowait(			    /* wait for CPU0 */
	    .func = active_load_balance_cpu_stop
	  );

	cpu_stopper_thread()
	  work = multi_cpu_stop
	    /* sync with CPU1 */
						take_cpu_down()
						  play_dead();

	  work = active_load_balance_cpu_stop
	    set_task_cpu(p, CPU1); /* oops!! */

Reported-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20170907150614.044460912@infradead.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3bcea40d3029..efeebed935ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8560,6 +8560,13 @@ static int active_load_balance_cpu_stop(void *data)
 	struct rq_flags rf;
 
 	rq_lock_irq(busiest_rq, &rf);
+	/*
+	 * Between queueing the stop-work and running it is a hole in which
+	 * CPUs can become inactive. We should not move tasks from or to
+	 * inactive CPUs.
+	 */
+	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+		goto out_unlock;
 
 	/* make sure the requested cpu hasn't gone down in the meantime */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
--
cgit v1.2.3-70-g09d2

From 4ff9083b8a9a80bdf4ebbbec22cda4cbfb60f7aa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 7 Sep 2017 17:03:52 +0200
Subject: sched/core: WARN() when migrating to an offline CPU

Migrating tasks to offline CPUs is a pretty big fail, warn about it.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20170907150614.094206976@infradead.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 136a76d80dbf..18a6966567da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 				      lockdep_is_held(&task_rq(p)->lock)));
 #endif
+	/*
+	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+	 */
+	WARN_ON_ONCE(!cpu_online(new_cpu));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
--
cgit v1.2.3-70-g09d2

From 9469eb01db891b55367ee7539f1b9f7f6fd2819d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 7 Sep 2017 17:03:53 +0200
Subject: sched/debug: Add debugfs knob for "sched_debug"

I'm forever late for editing my kernel cmdline, add a runtime knob to
disable the "sched_debug" thing.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20170907150614.142924283@infradead.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/debug.c    | 5 +++++
 kernel/sched/sched.h    | 2 ++
 kernel/sched/topology.c | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..b19d06ea6e10 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = {
 	.release	= single_release,
 };
 
+__read_mostly bool sched_debug_enabled;
+
 static __init int sched_init_debug(void)
 {
 	debugfs_create_file("sched_features", 0644, NULL, NULL,
 			&sched_feat_fops);
 
+	debugfs_create_bool("sched_debug", 0644, NULL,
+			&sched_debug_enabled);
+
 	return 0;
 }
 late_initcall(sched_init_debug);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab1c7f5409a0..7ea2a0339771 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1954,6 +1954,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
 #ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6f7b43982f73..2ab2aa68c796 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_debug_enabled;
-
 static int __init sched_debug_setup(char *str)
 {
-	sched_debug_enabled = 1;
+	sched_debug_enabled = true;
 
 	return 0;
 }
--
cgit v1.2.3-70-g09d2
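
A note on trying out the last patch: debugfs_create_bool() with a NULL parent
places the new file at the debugfs root, so on a kernel carrying this series
the knob should show up as /sys/kernel/debug/sched_debug, assuming debugfs is
mounted at its conventional location (that path is an assumption of this
sketch, not something stated in the patch). The bool file accepts the usual
Y/N or 1/0 values, and flipping it at runtime gates the sched-domain debug
printout that previously required booting with "sched_debug". A minimal
userspace sketch, illustrative only and not part of the series:

	/* toggle_sched_debug.c - illustrative helper, NOT part of this series.
	 *
	 * Writes Y or N to the debugfs bool created by the last patch above.
	 * Assumes debugfs is mounted at /sys/kernel/debug; adjust the path if
	 * your system mounts it elsewhere.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		const char *path = "/sys/kernel/debug/sched_debug";
		/* default to enabling; pass "0" as the first argument to disable */
		const char *val = (argc > 1 && argv[1][0] == '0') ? "N" : "Y";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return EXIT_FAILURE;
		}
		if (fputs(val, f) == EOF || fclose(f) == EOF) {
			perror(path);
			return EXIT_FAILURE;
		}
		return EXIT_SUCCESS;
	}

With the knob set to Y, the next sched-domain rebuild (e.g. a CPU hotplug
event) prints the domain hierarchy to the kernel log, much as the boot-time
parameter would; reading the file back reports the current Y/N state.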