From f9a25f776d780bfa3279f0b6e5f5cf3224997976 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:55 +0200 Subject: cpusets: Rebuild root domain deadline accounting information When the topology of root domains is modified by CPUset or CPUhotplug operations information about the current deadline bandwidth held in the root domain is lost. This patch addresses the issue by recalculating the lost deadline bandwidth information by circling through the deadline tasks held in CPUsets and adding their current load to the root domain they are associated with. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Juri Lelli [ Various additional modifications. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-4-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..0f9d2180be23 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2283,6 +2283,36 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +void dl_add_task_root_domain(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + struct dl_bw *dl_b; + + rq = task_rq_lock(p, &rf); + if (!dl_task(p)) + goto unlock; + + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + + raw_spin_unlock(&dl_b->lock); + +unlock: + task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + rd->dl_bw.total_bw = 0; + raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 59d06cea1198d665ba11f7e8c5f45b00ff2e4812 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:56 +0200 Subject: sched/deadline: Fix bandwidth accounting at all levels after offline migration If a task happens to be throttled while the CPU it was running on gets hotplugged off, the bandwidth associated with the task is not correctly migrated with it when the replenishment timer fires (offline_migration). Fix things up, for this_bw, running_bw and total_bw, when replenishment timer fires and task is migrated (dl_task_offline_migration()). Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-5-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0f9d2180be23..039dde2b1dac 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; + struct dl_bw *dl_b; later_rq = find_lock_later_rq(p, rq); if (!later_rq) { @@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } + if (p->dl.dl_non_contending || p->dl.dl_throttled) { + /* + * Inactive timer is armed (or callback is running, but + * waiting for us to release rq locks). In any case, when it + * will fire (or continue), it will see running_bw of this + * task migrated to later_rq (and correctly handle it). + */ + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); + + add_rq_bw(&p->dl, &later_rq->dl); + add_running_bw(&p->dl, &later_rq->dl); + } else { + sub_rq_bw(&p->dl, &rq->dl); + add_rq_bw(&p->dl, &later_rq->dl); + } + + /* + * And we finally need to fixup root_domain(s) bandwidth accounting, + * since p is still hanging out in the old (now moved to default) root + * domain. + */ + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + + dl_b = &later_rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + set_task_cpu(p, later_rq->cpu); double_unlock_balance(later_rq, rq); -- cgit v1.2.3-70-g09d2 From f95d4eaee6d0207bff2dc93371133d31227d4cfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:40 +0000 Subject: sched/{rt,deadline}: Fix set_next_task vs pick_next_task Because pick_next_task() implies set_curr_task() and some of the details haven't mattered too much, some of what _should_ be in set_curr_task() ended up in pick_next_task, correct this. This prepares the way for a pick_next_task() variant that does not affect the current state; allowing remote picking. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/38c61d5240553e043c27c5e00b9dd0d184dd6081.1559129225.git.vpillai@digitalocean.com --- kernel/sched/deadline.c | 22 +++++++++++----------- kernel/sched/rt.c | 26 +++++++++++++------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 039dde2b1dac..2dc2784b196c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1727,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); + + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); } static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1791,15 +1799,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = dl_task_of(dl_se); - set_next_task(rq, p); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - deadline_queue_push_tasks(rq); - - if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_dl(rq, p); return p; } @@ -1846,7 +1846,7 @@ static void task_fork_dl(struct task_struct *p) static void set_curr_task_dl(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_dl(rq, rq->curr); } #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..40bb71004325 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1577,17 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = _pick_next_task_rt(rq); - set_next_task(rq, p); - - rt_queue_push_tasks(rq); - - /* - * If prev task was rt, put_prev_task() has already updated the - * utilization. We only care of the case where we start to schedule a - * rt task - */ - if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_rt(rq, p); return p; } @@ -2356,7 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void set_curr_task_rt(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_rt(rq, rq->curr); } static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) -- cgit v1.2.3-70-g09d2 From 03b7fad167efca3b7abbbb39733933f9df56e79c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:41 +0000 Subject: sched: Add task_struct pointer to sched_class::set_curr_task In preparation of further separating pick_next_task() and set_curr_task() we have to pass the actual task into it, while there, rename the thing to better pair with put_prev_task(). Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/a96d1bcdd716db4a4c5da2fece647a1456c0ed78.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 12 ++++++------ kernel/sched/deadline.c | 7 +------ kernel/sched/fair.c | 17 ++++++++++++++--- kernel/sched/idle.c | 27 +++++++++++++++------------ kernel/sched/rt.c | 7 +------ kernel/sched/sched.h | 7 ++++--- kernel/sched/stop_task.c | 17 +++++++---------- 7 files changed, 48 insertions(+), 46 deletions(-) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 364b6d7da2be..0c4220789092 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1494,7 +1494,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -4325,7 +4325,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4392,7 +4392,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4840,7 +4840,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -6042,7 +6042,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6919,7 +6919,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); if (running) - set_curr_task(rq, tsk); + set_next_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc2784b196c..6eae79350303 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1844,11 +1844,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task_dl(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2466,6 +2461,7 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, @@ -2476,7 +2472,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d8043fc8317..8ce1b8893947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10150,9 +10150,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10423,7 +10433,9 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, @@ -10436,7 +10448,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..54194d41035c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,14 +374,25 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next) +{ + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); +} + static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { + struct task_struct *next = rq->idle; + put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); + set_next_task_idle(rq, next); - return rq->idle; + return next; } /* @@ -397,10 +408,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) raw_spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - /* * scheduler tick hitting a task of our scheduling class. * @@ -413,10 +420,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); @@ -451,13 +454,13 @@ const struct sched_class idle_sched_class = { .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, .get_rr_interval = get_rr_interval_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 40bb71004325..f71bcbe1a00c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2354,11 +2354,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } -static void set_curr_task_rt(struct rq *rq) -{ - set_next_task_rt(rq, rq->curr); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2380,6 +2375,7 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, @@ -2391,7 +2387,6 @@ const struct sched_class rt_sched_class = { .switched_from = switched_from_rt, #endif - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3449d0dd7f0..f3c50445bf22 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1707,6 +1707,7 @@ struct sched_class { struct task_struct *prev, struct rq_flags *rf); void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); @@ -1721,7 +1722,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task)(struct rq *rq); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); void (*task_dead)(struct task_struct *p); @@ -1755,9 +1755,10 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next) { - curr->sched_class->set_curr_task(rq); + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next); } #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..47a3d2a18a9a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,6 +23,11 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } +static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +{ + stop->se.exec_start = rq_clock_task(rq); +} + static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -32,8 +37,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf return NULL; put_prev_task(rq, prev); - - stop->se.exec_start = rq_clock_task(rq); + set_next_task_stop(rq, stop); return stop; } @@ -86,13 +90,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - stop->se.exec_start = rq_clock_task(rq); -} - static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ @@ -128,13 +125,13 @@ const struct sched_class stop_sched_class = { .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_stop, .task_tick = task_tick_stop, .get_rr_interval = get_rr_interval_stop, -- cgit v1.2.3-70-g09d2 From 5f2a45fc9e89e022233085e6f0f352eb6ff770bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:43 +0000 Subject: sched: Allow put_prev_task() to drop rq->lock Currently the pick_next_task() loop is convoluted and ugly because of how it can drop the rq->lock and needs to restart the picking. For the RT/Deadline classes, it is put_prev_task() where we do balancing, and we could do this before the picking loop. Make this possible. Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider Cc: Aaron Lu Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/e4519f6850477ab7f3d257062796e6425ee4ba7c.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 14 +++++++++++++- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 14 +++++++++++++- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 2 +- 7 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c4220789092..7bbe78a31ba5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6090,7 +6090,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next); + next->sched_class->put_prev_task(rq, next, NULL); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6eae79350303..2872e15a87cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,13 +1804,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); + + if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c27eda9f24..4418c1998e69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6901,7 +6901,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 54194d41035c..8d59de2e4a6e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,7 +374,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f71bcbe1a00c..dbdabd76f192 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1592,7 +1592,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_rt(rq); @@ -1604,6 +1604,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 304d98e712bf..e085cffb8004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1710,7 +1710,7 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1756,7 +1756,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47a3d2a18a9a..8f414018d5e0 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -59,7 +59,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *curr = rq->curr; u64 delta_exec; -- cgit v1.2.3-70-g09d2 From 67692435c411e5c53a1c588ecca2037aebd81f2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:44 +0000 Subject: sched: Rework pick_next_task() slow-path Avoid the RETRY_TASK case in the pick_next_task() slow path. By doing the put_prev_task() early, we get the rt/deadline pull done, and by testing rq->nr_running we know if we need newidle_balance(). This then gives a stable state to pick a task from. Since the fast-path is fair only; it means the other classes will always have pick_next_task(.prev=NULL, .rf=NULL) and we can simplify. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/aa34d24b36547139248f32a30138791ac6c02bd6.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 19 ++++++++++++------- kernel/sched/deadline.c | 30 ++---------------------------- kernel/sched/fair.c | 9 ++++++--- kernel/sched/idle.c | 4 +++- kernel/sched/rt.c | 29 +---------------------------- kernel/sched/sched.h | 13 ++++++++----- kernel/sched/stop_task.c | 3 ++- 7 files changed, 34 insertions(+), 73 deletions(-) (limited to 'kernel/sched/deadline.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bbe78a31ba5..a6661852907b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3791,7 +3791,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) @@ -3800,14 +3800,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -again: +restart: + /* + * Ensure that we put DL/RT tasks before the pick loop, such that they + * can PULL higher prio tasks when we lower the RQ 'priority'. + */ + prev->sched_class->put_prev_task(rq, prev, rf); + if (!rq->nr_running) + newidle_balance(rq, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq, NULL, NULL); + if (p) return p; - } } /* The idle class should always have a runnable task: */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2872e15a87cd..0b9cbfb2b1d4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1761,39 +1761,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct dl_rq *dl_rq; - dl_rq = &rq->dl; - - if (need_pull_dl_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_dl_task() can drop (and re-acquire) rq->lock; this - * means a stop task can slip in, in which case we need to - * re-start task selection. - */ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } + WARN_ON_ONCE(prev || rf); - /* - * When prev is DL, we may throttle it in put_prev_task(). - * So, we update time before we check for dl_nr_running. - */ - if (prev->sched_class == &dl_sched_class) - update_curr_dl(rq); + dl_rq = &rq->dl; if (unlikely(!dl_rq->dl_nr_running)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4418c1998e69..19c58599e967 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6770,7 +6770,7 @@ again: goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6847,8 +6847,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6876,6 +6876,9 @@ done: __maybe_unused; return p; idle: + if (!rf) + return NULL; + new_tasks = newidle_balance(rq, rf); /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8d59de2e4a6e..7c54550dda6a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -389,7 +389,9 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *next = rq->idle; - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); + set_next_task_idle(rq, next); return next; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbdabd76f192..858c4cc6f99b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1553,38 +1553,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl or stop task can slip in, in which case we need - * to re-start task selection. - */ - if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || - rq->dl.dl_nr_running)) - return RETRY_TASK; - } - - /* - * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_queued check. - */ - if (prev->sched_class == &rt_sched_class) - update_curr_rt(rq); + WARN_ON_ONCE(prev || rf); if (!rt_rq->rt_queued) return NULL; - put_prev_task(rq, prev); - p = _pick_next_task_rt(rq); set_next_task_rt(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e085cffb8004..7111e3a1eeb4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1700,12 +1700,15 @@ struct sched_class { void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* - * It is the responsibility of the pick_next_task() method that will - * return the next task to call put_prev_task() on the @prev task or - * something equivalent. + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). * - * May return RETRY_TASK when it finds a higher prio class has runnable - * tasks. + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. */ struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 8f414018d5e0..7e1cee4e65b2 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -33,10 +33,11 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *stop = rq->stop; + WARN_ON_ONCE(prev || rf); + if (!stop || !task_on_rq_queued(stop)) return NULL; - put_prev_task(rq, prev); set_next_task_stop(rq, stop); return stop; -- cgit v1.2.3-70-g09d2