From e0f5f3afd2cffa96291cd852056d83ff4e2e99c7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:09 +0100 Subject: sched/fair: Make load tracking frequency scale-invariant Apply frequency scaling correction factor to per-entity load tracking to make it frequency invariant. Currently, load appears bigger when the CPU is running slower which affects load-balancing decisions. Each segment of the sched_avg.load_sum geometric series is now scaled by the current frequency so that the sched_avg.load_avg of each sched entity will be invariant from frequency scaling. Moreover, cfs_rq.runnable_load_sum is scaled by the current frequency as well. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a4ab9daa387c..c8d923ba429d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1177,9 +1177,9 @@ struct load_weight { /* * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors the amount of time that a sched_entity is - * runnable on a rq into its weight. For cfs_rq, it is the aggregated - * such weights of all runnable and blocked sched_entities. + * 1) load_avg factors frequency scaling into the amount of time that a + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the + * aggregated such weights of all runnable and blocked sched_entities. * 2) util_avg factors frequency scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and -- cgit v1.2.3-70-g09d2 From e3279a2e6d697e00e74f905851ee7cf532f72b2d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 15 Aug 2015 00:04:41 +0100 Subject: sched/fair: Make utilization tracking CPU scale-invariant Besides the existing frequency scale-invariance correction factor, apply CPU scale-invariance correction factor to utilization tracking to compensate for any differences in compute capacity. This could be due to micro-architectural differences (i.e. instructions per seconds) between cpus in HMP systems (e.g. big.LITTLE), and/or differences in the current maximum frequency supported by individual cpus in SMP systems. In the existing implementation utilization isn't comparable between cpus as it is relative to the capacity of each individual CPU. Each segment of the sched_avg.util_sum geometric series is now scaled by the CPU performance factor too so the sched_avg.util_avg of each sched entity will be invariant from the particular CPU of the HMP/SMP system on which the sched entity is scheduled. With this patch, the utilization of a CPU stays relative to the max CPU performance of the fastest CPU in the system. In contrast to utilization (sched_avg.util_sum), load (sched_avg.load_sum) should not be scaled by compute capacity. The utilization metric is based on running time which only makes sense when cpus are _not_ fully utilized (utilization cannot go beyond 100% even if more tasks are added), where load is runnable time which isn't limited by the capacity of the CPU and therefore is a better metric for overloaded scenarios. If we run two nice-0 busy loops on two cpus with different compute capacity their load should be similar since their compute demands are the same. We have to assume that the compute demand of any task running on a fully utilized CPU (no spare cycles = 100% utilization) is high and the same no matter of the compute capacity of its current CPU, hence we shouldn't scale load by CPU capacity. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55CE7409.1000700@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched/fair.c | 7 ++++--- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c8d923ba429d..bd38b3ee9e83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,7 +1180,7 @@ struct load_weight { * 1) load_avg factors frequency scaling into the amount of time that a * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency scaling into the amount of time + * 2) util_avg factors frequency and cpu scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and * blocked sched_entities. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 102cdf1e4e97..573dc98c6248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2553,6 +2553,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta = now - sa->last_update_time; /* @@ -2596,7 +2597,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scaled_delta_w; + sa->util_sum += scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2620,7 +2621,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib; + sa->util_sum += scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ @@ -2631,7 +2632,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scaled_delta; + sa->util_sum += scale(scaled_delta, scale_cpu); sa->period_contrib += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0726d5fd6a3..167ab4844ee6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1398,7 +1398,7 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) return sd->smt_gain / sd->span_weight; return SCHED_CAPACITY_SCALE; -- cgit v1.2.3-70-g09d2 From 2726d6ce389788c7fe724961a6e1bfe569560088 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 2 Sep 2015 11:01:34 +0100 Subject: sched/deadline: Unify dl_time_before() usage Move dl_time_before() static definition in include/linux/sched/deadline.h so that it can be used by different parties without being re-defined. Reported-by: Luca Abeni Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441188096-23021-3-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 5 +++++ kernel/sched/cpudeadline.c | 5 ----- kernel/sched/cpudeadline.h | 1 + kernel/sched/sched.h | 5 ----- 4 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 9d303b8847df..9089a2ae913d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p) return dl_prio(p->prio); } +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + #endif /* _SCHED_DEADLINE_H */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07466bb..5a75b08cfd85 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,11 +31,6 @@ static inline int right_child(int i) return (i << 1) + 2; } -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef2fbe1..fcbdf83fed7e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -2,6 +2,7 @@ #define _LINUX_CPUDL_H #include +#include #define IDX_INVALID -1 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3845a711c65e..af6f252e7e34 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -118,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline bool dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - /* * Tells if entity @a should preempt entity @b. */ -- cgit v1.2.3-70-g09d2 From c6e1e7b5b7f031910850ddaf7bfa65ba3b4843ea Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 22 Sep 2015 12:48:59 +0200 Subject: sched/core: Make 'sched_domain_topology' declaration static The 'sched_domain_topology' variable is only used within kernel/sched/core.c. Make it static. Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442918939-9907-1-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/sched/core.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd38b3ee9e83..699228bb0035 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1127,8 +1127,6 @@ struct sched_domain_topology_level { #endif }; -extern struct sched_domain_topology_level *sched_domain_topology; - extern void set_sched_topology(struct sched_domain_topology_level *tl); extern void wake_up_if_idle(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b30b5b24a4a..a91df6171f48 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6445,7 +6445,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -- cgit v1.2.3-70-g09d2 From 87dcbc0610cb580c8eaf289f52aca3620af825f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:45:40 +0200 Subject: sched/core: Simplify INIT_PREEMPT_COUNT As per the following commit: d86ee4809d03 ("sched: optimize cond_resched()") we need PREEMPT_ACTIVE to avoid cond_resched() from working before the scheduler is set up. However, keeping preemption disabled should do the same thing already, making the PREEMPT_ACTIVE part entirely redundant. The only complication is !PREEMPT_COUNT kernels, where PREEMPT_DISABLED ends up being 0. Instead we use an unconditional PREEMPT_OFFSET to set preempt_count() even on !PREEMPT_COUNT kernels. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d086cf0ca2c7..e5b8cbc4b8d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -606,19 +606,18 @@ struct task_cputime_atomic { #endif /* - * Disable preemption until the scheduler is running. - * Reset by start_kernel()->sched_init()->init_idle(). + * Disable preemption until the scheduler is running -- use an unconditional + * value so that it also works on !PREEMPT_COUNT kernels. * - * We include PREEMPT_ACTIVE to avoid cond_resched() from working - * before the scheduler is active -- see should_resched(). + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ -#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) +#define INIT_PREEMPT_COUNT PREEMPT_OFFSET /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * @running: non-zero when there are timers running and - * @cputime receives updates. + * @cputime receives updates. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. -- cgit v1.2.3-70-g09d2 From 609ca066386b2e64d4c0b0f55da327654962a0c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:52:18 +0200 Subject: sched/core: Create preempt_count invariant Assuming units of PREEMPT_DISABLE_OFFSET for preempt_count() numbers. Now that TASK_DEAD no longer results in preempt_count() == 3 during scheduling, we will always call context_switch() with preempt_count() == 2. However, we don't always end up with preempt_count() == 2 in finish_task_switch() because new tasks get created with preempt_count() == 1. Create FORK_PREEMPT_COUNT and set it to 2 and use that in the right places. Note that we cannot use INIT_PREEMPT_COUNT as that serves another purpose (boot). After this, preempt_count() is invariant across the context switch, with exception of PREEMPT_ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 2 +- include/asm-generic/preempt.h | 2 +- include/linux/sched.h | 17 ++++++++++++----- kernel/sched/core.c | 23 +++++++++++++++++++++-- 4 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01e700d392cb 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -31,7 +31,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->saved_preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 0bec580a4885..5d8ffa3e6f8c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index e5b8cbc4b8d6..23ca455d9582 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,11 +599,7 @@ struct task_cputime_atomic { .sum_exec_runtime = ATOMIC64_INIT(0), \ } -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) -#else -#define PREEMPT_DISABLED PREEMPT_ENABLED -#endif +#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional @@ -613,6 +609,17 @@ struct task_cputime_atomic { */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET +/* + * Initial preempt_count value; reflects the preempt_count schedule invariant + * which states that during context switches: + * + * preempt_count() == 2*PREEMPT_DISABLE_OFFSET + * + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. + * Note: See finish_task_switch(). + */ +#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 530fe8baa645..8d8722b84dee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2504,6 +2504,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + rq->prev_mm = NULL; /* @@ -2588,8 +2600,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); -- cgit v1.2.3-70-g09d2 From c73464b1c8434ad4cbfd5369c3e724f3e8ffe5a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:06:56 +0200 Subject: sched/core: Fix trace_sched_switch() __trace_sched_switch_state() is the last remaining PREEMPT_ACTIVE user, move trace_sched_switch() from prepare_task_switch() to __schedule() and propagate the @preempt argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 22 +++++++++------------- kernel/sched/core.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 3 ++- kernel/trace/trace_sched_wakeup.c | 2 +- 5 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 539d6bc3216a..9b90c57517a9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS -static inline long __trace_sched_switch_state(struct task_struct *p) +static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) { - long state = p->state; - -#ifdef CONFIG_PREEMPT #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ + /* - * For all intents and purposes a preempted task is a running task. + * Preemption ignores task state, therefore preempted tasks are always + * RUNNING (we will not have dequeued if state != RUNNING). */ - if (preempt_count() & PREEMPT_ACTIVE) - state = TASK_RUNNING | TASK_STATE_MAX; -#endif /* CONFIG_PREEMPT */ - - return state; + return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; } #endif /* CREATE_TRACE_POINTS */ @@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p) */ TRACE_EVENT(sched_switch, - TP_PROTO(struct task_struct *prev, + TP_PROTO(bool preempt, + struct task_struct *prev, struct task_struct *next), - TP_ARGS(prev, next), + TP_ARGS(preempt, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) @@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(prev); + __entry->prev_state = __trace_sched_switch_state(preempt, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a71f89fb3fc..cfad7f5f74f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2470,7 +2470,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -3132,6 +3131,7 @@ static void __sched __schedule(bool preempt) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0623ac785a2..00611e95a8ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5697,7 +5697,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index f270088e9929..4c896a0101bd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 12cbe77b4136..4bcfbac289ff 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; -- cgit v1.2.3-70-g09d2 From 3d8f74dd4ca1da8a1a464bbafcf679e40c2fc10f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:09:19 +0200 Subject: sched/core: Stop setting PREEMPT_ACTIVE Now that nothing tests for PREEMPT_ACTIVE anymore, stop setting it. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 12 ------------ kernel/sched/core.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bea8dd8ff5e0..448dfd0b2ea6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -146,18 +146,6 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) -#define preempt_active_enter() \ -do { \ - preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ - barrier(); \ -} while (0) - -#define preempt_active_exit() \ -do { \ - barrier(); \ - preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ -} while (0) - #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfad7f5f74f8..6344d82a84f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3201,9 +3201,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_active_enter(); + preempt_disable(); __schedule(true); - preempt_active_exit(); + sched_preempt_enable_no_resched(); /* * Check again in case we missed a preemption opportunity @@ -3254,13 +3254,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { - /* - * Use raw __prempt_count() ops that don't call function. - * We can't call functions before disabling preemption which - * disarm preemption tracing recursions. - */ - __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); - barrier(); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3270,8 +3264,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); - barrier(); - __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + preempt_enable_no_resched_notrace(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); @@ -3294,11 +3287,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - preempt_active_enter(); + preempt_disable(); local_irq_enable(); __schedule(true); local_irq_disable(); - preempt_active_exit(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); -- cgit v1.2.3-70-g09d2 From da7142e2ed735e1c1bef5a757dc55de35c65fbd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:11:45 +0200 Subject: sched/core: Simplify preempt_count tests Since we stopped setting PREEMPT_ACTIVE, there is no need to mask it out of preempt_count() tests. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 3 +-- kernel/sched/core.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 448dfd0b2ea6..b2676a16cfbe 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -126,8 +126,7 @@ * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) +#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d6989f85c641..ca260cc5d881 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7486,7 +7486,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = preempt_count() + rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v1.2.3-70-g09d2 From e61bf1e43b6f8e687ce93e5d0ce85bca7e481600 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:11:35 +0200 Subject: sched/core: Kill PREEMPT_ACTIVE Its unused, kill the definition. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b2676a16cfbe..75e4e30677f1 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -26,7 +26,6 @@ * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 @@ -53,10 +52,6 @@ #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 -- cgit v1.2.3-70-g09d2 From 233e7f267e580fefdeb36628b7efe8bfe056d27c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Oct 2015 16:51:31 +0200 Subject: stop_machine: Ensure that a queued callback will be called before cpu_stop_park() cpu_stop_queue_work() checks stopper->enabled before it queues the work, but ->enabled == T can only guarantee cpu_stop_signal_done() if we race with cpu_down(). This is not enough for stop_two_cpus() or stop_machine(), they will deadlock if multi_cpu_stop() won't be called by one of the target CPU's. stop_machine/stop_cpus are fine, they rely on stop_cpus_mutex. But stop_two_cpus() has to check cpu_active() to avoid the same race with hotplug, and this check is very unobvious and probably not even correct if we race with cpu_up(). Change cpu_down() pass to clear ->enabled before cpu_stopper_thread() flushes the pending ->works and returns with KTHREAD_SHOULD_PARK set. Note also that smpboot_thread_call() calls cpu_stop_unpark() which sets enabled == T at CPU_ONLINE stage, so this CPU can't go away until cpu_stopper_thread() is called at least once. This all means that if cpu_stop_queue_work() succeeds, we know that work->fn() will be called. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151008145131.GA18139@redhat.com Signed-off-by: Ingo Molnar --- include/linux/stop_machine.h | 1 + kernel/cpu.c | 2 +- kernel/stop_machine.c | 23 +++++++++++++---------- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 414d924318ce..7b76362b381c 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -33,6 +33,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +void stop_machine_park(int cpu); #else /* CONFIG_SMP */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 050c63472f03..c85df2775b73 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -344,7 +344,7 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - kthread_park(current); + stop_machine_park((long)param->hcpu); return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12484e5d5c88..6a402098d4ab 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -452,6 +452,18 @@ repeat: } } +void stop_machine_park(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + /* + * Lockless. cpu_stopper_thread() will take stopper->lock and flush + * the pending works before it parks, until then it is fine to queue + * the new works. + */ + stopper->enabled = false; + kthread_park(stopper->thread); +} + extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) @@ -462,17 +474,8 @@ static void cpu_stop_create(unsigned int cpu) static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work, *tmp; - unsigned long flags; - /* drain remaining works */ - spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry_safe(work, tmp, &stopper->works, list) { - list_del_init(&work->list); - cpu_stop_signal_done(work->done, false); - } - stopper->enabled = false; - spin_unlock_irqrestore(&stopper->lock, flags); + WARN_ON(!list_empty(&stopper->works)); } static void cpu_stop_unpark(unsigned int cpu) -- cgit v1.2.3-70-g09d2 From c00166d87e730088d919814020e96ffed129d0d1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 9 Oct 2015 18:00:49 +0200 Subject: stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() 1. Change smpboot_unpark_thread() to check ->selfparking, just like smpboot_park_thread() does. 2. Introduce stop_machine_unpark() which sets ->enabled and calls kthread_unpark(). 3. Change smpboot_thread_call() and cpu_stop_init() to call stop_machine_unpark() by hand. This way: - IMO the ->selfparking logic becomes more consistent. - We can kill the smp_hotplug_thread->pre_unpark() method. - We can easily unpark the stopper thread earlier. Say, we can move stop_machine_unpark() from smpboot_thread_call() to sched_cpu_active() as Peter suggests. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151009160049.GA10166@redhat.com Signed-off-by: Ingo Molnar --- include/linux/smpboot.h | 4 ---- include/linux/stop_machine.h | 1 + kernel/cpu.c | 1 + kernel/smpboot.c | 5 ++--- kernel/stop_machine.c | 10 +++++++++- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index e6109a6cd8f6..12910cf19869 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -24,9 +24,6 @@ struct smpboot_thread_data; * parked (cpu offline) * @unpark: Optional unpark function, called when the thread is * unparked (cpu online) - * @pre_unpark: Optional unpark function, called before the thread is - * unparked (cpu online). This is not guaranteed to be - * called on the target cpu of the thread. Careful! * @cpumask: Internal state. To update which threads are unparked, * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. @@ -42,7 +39,6 @@ struct smp_hotplug_thread { void (*cleanup)(unsigned int cpu, bool online); void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); - void (*pre_unpark)(unsigned int cpu); cpumask_var_t cpumask; bool selfparking; const char *thread_comm; diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 7b76362b381c..0adedca24c5b 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -34,6 +34,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); void stop_machine_park(int cpu); +void stop_machine_unpark(int cpu); #else /* CONFIG_SMP */ diff --git a/kernel/cpu.c b/kernel/cpu.c index c85df2775b73..6467521e1e15 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -475,6 +475,7 @@ static int smpboot_thread_call(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: + stop_machine_unpark(cpu); smpboot_unpark_threads(cpu); break; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index a818cbc73e14..d264f59bff56 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (ht->pre_unpark) - ht->pre_unpark(cpu); - kthread_unpark(tsk); + if (!ht->selfparking) + kthread_unpark(tsk); } void smpboot_unpark_threads(unsigned int cpu) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 91fbb109de6c..59096a55089f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,6 +513,14 @@ static void cpu_stop_unpark(unsigned int cpu) spin_unlock_irq(&stopper->lock); } +void stop_machine_unpark(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + cpu_stop_unpark(cpu); + kthread_unpark(stopper->thread); +} + static struct smp_hotplug_thread cpu_stop_threads = { .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, @@ -521,7 +529,6 @@ static struct smp_hotplug_thread cpu_stop_threads = { .create = cpu_stop_create, .setup = cpu_stop_unpark, .park = cpu_stop_park, - .pre_unpark = cpu_stop_unpark, .selfparking = true, }; @@ -537,6 +544,7 @@ static int __init cpu_stop_init(void) } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); + stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } -- cgit v1.2.3-70-g09d2