From 7281c8dec8a87685cb54d503d8cceef5a0fc2fdd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 20 Apr 2018 14:29:51 +0200
Subject: sched/core: Fix possible Spectre-v1 indexing for sched_prio_to_weight[]

> kernel/sched/core.c:6921 cpu_weight_nice_write_s64() warn: potential spectre issue 'sched_prio_to_weight'

Userspace controls @nice, so sanitize the value before using it to
index an array.

Reported-by: Dan Carpenter
Signed-off-by: Peter Zijlstra (Intel)
Cc:
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ffde9eebc846..092f7c4de903 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8,6 +8,7 @@
 #include "sched.h"

 #include
+#include <linux/nospec.h>

 #include
 #include

@@ -6923,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
                                     struct cftype *cft, s64 nice)
 {
        unsigned long weight;
+       int idx;

        if (nice < MIN_NICE || nice > MAX_NICE)
                return -ERANGE;

-       weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+       idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+       idx = array_index_nospec(idx, 40);
+       weight = sched_prio_to_weight[idx];
+
        return sched_group_set_shares(css_tg(css), scale_load(weight));
 }
 #endif
-- cgit v1.2.3-70-g09d2

From 354d7793070611b4df5a79fbb0f12752d0ed0cc5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 20 Apr 2018 15:03:45 +0200
Subject: sched/autogroup: Fix possible Spectre-v1 indexing for sched_prio_to_weight[]

> kernel/sched/autogroup.c:230 proc_sched_autogroup_set_nice() warn: potential spectre issue 'sched_prio_to_weight'

Userspace controls @nice, sanitize the array index.

Reported-by: Dan Carpenter
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc:
Signed-off-by: Ingo Molnar
---
 kernel/sched/autogroup.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 6be6c575b6cd..2d4ff5353ded 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -2,6 +2,7 @@
 /*
  * Auto-group scheduling implementation:
  */
+#include <linux/nospec.h>
 #include "sched.h"

 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
        static unsigned long next = INITIAL_JIFFIES;
        struct autogroup *ag;
        unsigned long shares;
-       int err;
+       int err, idx;

        if (nice < MIN_NICE || nice > MAX_NICE)
                return -EINVAL;
@@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
        next = HZ / 10 + jiffies;
        ag = autogroup_task_get(p);
-       shares = scale_load(sched_prio_to_weight[nice + 20]);
+
+       idx = array_index_nospec(nice + 20, 40);
+       shares = scale_load(sched_prio_to_weight[idx]);

        down_write(&ag->lock);
        err = sched_group_set_shares(ag->tg, shares);
-- cgit v1.2.3-70-g09d2
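
A note on the pattern used by the two patches above: the nice range check alone does not stop a mispredicted branch from speculatively indexing sched_prio_to_weight[] out of bounds, which is why the derived index is additionally clamped with array_index_nospec(). A minimal sketch of that pattern, separate from the patches themselves (the helper name is made up for illustration; the bound of 40 and the symbols MIN_NICE, MAX_NICE, NICE_TO_PRIO, MAX_RT_PRIO and sched_prio_to_weight[] are the ones the patches use):

    #include <linux/nospec.h>

    /*
     * Illustrative helper, not a kernel function: range-check the
     * userspace-controlled nice value, then clamp the derived index with
     * array_index_nospec() so that a mispredicted bounds check cannot be
     * used to speculatively read past sched_prio_to_weight[] (Spectre-v1).
     */
    static int nice_to_weight(long nice, unsigned long *weight)
    {
            int idx;

            if (nice < MIN_NICE || nice > MAX_NICE)
                    return -ERANGE;

            /* Map nice [-20, 19] onto an index in [0, 39]. */
            idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;

            /* The index stays in range under speculation as well. */
            idx = array_index_nospec(idx, 40);

            *weight = sched_prio_to_weight[idx];
            return 0;
    }
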
From a744490f12707d9f0b205272b29adf5bdb3ba193 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Wed, 9 May 2018 10:40:51 +0200
Subject: cpufreq: schedutil: remove stale comment

After commit 794a56ebd9a57 (sched/cpufreq: Change the worker kthread to
SCHED_DEADLINE), schedutil kthreads are "ignored" from a clock frequency
selection point of view, so the potential corner case for RT tasks is no
longer possible at all. Remove the stale comment mentioning it.

Signed-off-by: Juri Lelli
Signed-off-by: Rafael J. Wysocki
---
 kernel/sched/cpufreq_schedutil.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d2c6083304b4..23ef19070137 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -396,19 +396,6 @@ static void sugov_irq_work(struct irq_work *irq_work)

        sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

-       /*
-        * For RT tasks, the schedutil governor shoots the frequency to maximum.
-        * Special care must be taken to ensure that this kthread doesn't result
-        * in the same behavior.
-        *
-        * This is (mostly) guaranteed by the work_in_progress flag. The flag is
-        * updated only at the end of the sugov_work() function and before that
-        * the schedutil governor rejects all other frequency scaling requests.
-        *
-        * There is a very rare case though, where the RT thread yields right
-        * after the work_in_progress flag is cleared. The effects of that are
-        * neglected for now.
-        */
        kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 }
-- cgit v1.2.3-70-g09d2

From 97739501f207efe33145b918817f305b822987f8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 9 May 2018 11:44:56 +0200
Subject: cpufreq: schedutil: Avoid using invalid next_freq

If the next_freq field of struct sugov_policy is set to UINT_MAX,
it shouldn't be used for updating the CPU frequency (this is a special
"invalid" value), but after commit b7eaf1aab9f8 (cpufreq: schedutil:
Avoid reducing frequency of busy CPUs prematurely) it may be passed as
the new frequency to sugov_update_commit() in sugov_update_single().

Fix that by adding an extra check for the special UINT_MAX value of
next_freq to sugov_update_single().

Fixes: b7eaf1aab9f8 (cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely)
Reported-by: Viresh Kumar
Cc: 4.12+ # 4.12+
Signed-off-by: Rafael J. Wysocki
Acked-by: Viresh Kumar
Signed-off-by: Rafael J. Wysocki
---
 kernel/sched/cpufreq_schedutil.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 23ef19070137..e13df951aca7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -305,7 +305,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
         * Do not reduce the frequency if the CPU has not been idle
         * recently, as the reduction is likely to be premature then.
         */
-       if (busy && next_f < sg_policy->next_freq) {
+       if (busy && next_f < sg_policy->next_freq &&
+           sg_policy->next_freq != UINT_MAX) {
                next_f = sg_policy->next_freq;

                /* Reset cached freq as next_freq has changed */
-- cgit v1.2.3-70-g09d2
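
The special UINT_MAX value mentioned above marks next_freq as "no frequency computed yet", so the busy-CPU shortcut must not treat it as a real cached frequency. A condensed sketch of that interaction (simplified, made-up type and function names; only the UINT_MAX semantics and the added check come from the patch, and the initialization point is assumed):

    /* Sketch only, not the actual schedutil code. */
    struct sugov_policy_sketch {
            unsigned int next_freq;    /* UINT_MAX means "nothing cached yet" */
    };

    static void sketch_start(struct sugov_policy_sketch *sg)
    {
            /* Assumed initialization: mark the cache as invalid at start. */
            sg->next_freq = UINT_MAX;
    }

    static unsigned int sketch_pick_freq(struct sugov_policy_sketch *sg,
                                         unsigned int next_f, bool busy)
    {
            /* Reuse the cached frequency only if it is a real one. */
            if (busy && next_f < sg->next_freq && sg->next_freq != UINT_MAX)
                    next_f = sg->next_freq;

            return next_f;
    }
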
From 789ba28013ce23dbf5e9f5f014f4233b35523bf3 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 9 May 2018 17:31:15 +0100
Subject: Revert "sched/numa: Delay retrying placement for automatic NUMA balance after wake_affine()"

This reverts commit 7347fc87dfe6b7315e74310ee1243dc222c68086.

Srikar Dronamraju pointed out that while the commit in question did show
a performance improvement on ppc64, it did so at the cost of disabling
active CPU migration by automatic NUMA balancing, which was not the
intent. The issue was that a serious flaw in the logic meant that active
balancing never happened if SD_WAKE_AFFINE was disabled on scheduler
domains. Even when it is enabled, the logic is still bizarre and against
the original intent.

Investigation showed that fixing the patch either in the way he
suggested (using the correct comparison for jiffies values) or by
introducing a new numa_migrate_deferred variable in task_struct performs
similarly to a revert, with a mix of gains and losses depending on the
workload, machine and socket count.

The original intent of the commit was to handle a problem whereby
wake_affine, idle balancing and automatic NUMA balancing disagree on the
appropriate placement for a task. This was particularly true for cases
where a single task was a massive waker of tasks but where wake_wide
logic did not apply. It was especially noticeable when a futex (acting
as a barrier) woke all worker threads and wake_affine then tried to pull
the wakees to the waker's node. In that specific case, it could be
handled by tuning MPI or OpenMP appropriately, but the behavior is not
illogical and was worth attempting to fix. However, the approach was
wrong. Given that we're at rc4 and a fix is not obvious, it's better to
play it safe, revert this commit and retry later.

Signed-off-by: Mel Gorman
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Srikar Dronamraju
Cc: Linus Torvalds
Cc: Thomas Gleixner
Cc: efault@gmx.de
Cc: ggherdovich@suse.cz
Cc: hpa@zytor.com
Cc: matt@codeblueprint.co.uk
Cc: mpe@ellerman.id.au
Link: http://lkml.kernel.org/r/20180509163115.6fnnyeg4vdm2ct4v@techsingularity.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 57 +----------------------------------------------------
 1 file changed, 1 insertion(+), 56 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54dc31e7ab9b..f43627c6bb3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
        unsigned long interval = HZ;
-       unsigned long numa_migrate_retry;

        /* This task has no NUMA fault statistics yet */
        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)

        /* Periodically retry migrating the task to the preferred node */
        interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-       numa_migrate_retry = jiffies + interval;
-
-       /*
-        * Check that the new retry threshold is after the current one. If
-        * the retry is in the future, it implies that wake_affine has
-        * temporarily asked NUMA balancing to backoff from placement.
-        */
-       if (numa_migrate_retry > p->numa_migrate_retry)
-               return;
-
-       /* Safe to try placing the task on the preferred node */
-       p->numa_migrate_retry = numa_migrate_retry;
+       p->numa_migrate_retry = jiffies + interval;

        /* Success if task is already running on preferred CPU */
        if (task_node(p) == p->numa_preferred_nid)
@@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
        return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }

-#ifdef CONFIG_NUMA_BALANCING
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-       unsigned long interval;
-
-       if (!static_branch_likely(&sched_numa_balancing))
-               return;
-
-       /* If balancing has no preference then continue gathering data */
-       if (p->numa_preferred_nid == -1)
-               return;
-
-       /*
-        * If the wakeup is not affecting locality then it is neutral from
-        * the perspective of NUMA balacing so continue gathering data.
-        */
-       if (cpu_to_node(prev_cpu) == cpu_to_node(target))
-               return;
-
-       /*
-        * Temporarily prevent NUMA balancing trying to place waker/wakee after
-        * wakee has been moved by wake_affine. This will potentially allow
-        * related tasks to converge and update their data placement. The
-        * 4 * numa_scan_period is to allow the two-pass filter to migrate
-        * hot data to the wakers node.
-        */
-       interval = max(sysctl_numa_balancing_scan_delay,
-                      p->numa_scan_period << 2);
-       p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-
-       interval = max(sysctl_numa_balancing_scan_delay,
-                      current->numa_scan_period << 2);
-       current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-}
-#else
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-}
-#endif
-
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int this_cpu, int prev_cpu, int sync)
 {
@@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        if (target == nr_cpumask_bits)
                return prev_cpu;

-       update_wa_numa_placement(p, prev_cpu, target);
        schedstat_inc(sd->ttwu_move_affine);
        schedstat_inc(p->se.statistics.nr_wakeups_affine);
        return target;
-- cgit v1.2.3-70-g09d2
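
On the "correct comparison for jiffies values" mentioned in the revert message: plain relational operators on jiffies are not wrap-safe, which is one reason the reverted "numa_migrate_retry > p->numa_migrate_retry" check was fragile. A minimal sketch of the wrap-safe form (illustrative helper, not code from any of the patches above):

    #include <linux/jiffies.h>

    /*
     * Illustrative only: report whether a stored retry deadline has passed.
     * time_after() compares jiffies values via signed arithmetic on their
     * difference, so it keeps working across a jiffies wrap, unlike a plain
     * ">" comparison.
     */
    static bool retry_due(unsigned long retry_deadline)
    {
            return time_after(jiffies, retry_deadline);
    }
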