From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. Both these are still prone to missing a jiffy, but that has always been the case. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..75844a8f9aeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,25 +2517,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2546,13 +2553,39 @@ void update_idle_cpu_load(struct rq *this_rq) __update_cpu_load(this_rq, load, pending_updates); } +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + /* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); -- cgit v1.2.3-70-g09d2 From 2ea45800d8e1c3c51c45a233d6bd6289a297a386 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 May 2012 09:26:43 +0200 Subject: sched: Don't try allocating memory from offline nodes Allocators don't appreciate it when you try and allocate memory from offline nodes. Reported-and-tested-by: Tony Luck Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-epfc1io9whb7o22bcujf31vn@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75844a8f9aeb..55733616baaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6436,7 +6436,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; -- cgit v1.2.3-70-g09d2 From 74a5ce20e6eeeb3751340b390e7ac1d1d07bbf55 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 May 2012 18:00:43 +0200 Subject: sched: Fix SD_OVERLAP SD_OVERLAP exists to allow overlapping groups, overlapping groups appear in NUMA topologies that aren't fully connected. The typical result of not fully connected NUMA is that each cpu (or rather node) will have different spans for a particular distance. However due to how sched domains are traversed -- only the first cpu in the mask goes one level up -- the next level only cares about the spans of the cpus that went up. Due to this two things were observed to be broken: - build_overlap_sched_groups() -- since its possible the cpu we're building the groups for exists in multiple (or all) groups, the selection criteria of the first group didn't ensure there was a cpu for which is was true that cpumask_first(span) == cpu. Thus load- balancing would terminate. - update_group_power() -- assumed that the cpu span of the first group of the domain was covered by all groups of the child domain. The above explains why this isn't true, so deal with it. Signed-off-by: Peter Zijlstra Cc: David Rientjes Link: http://lkml.kernel.org/r/1337788843.9783.14.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++-- kernel/sched/fair.c | 25 ++++++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55733616baaa..3a69374fb427 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,11 +6030,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); atomic_inc(&sg->sgp->ref); - if (cpumask_test_cpu(cpu, sg_span)) + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + cpumask_first(sg_span) == cpu) { + WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); groups = sg; + } if (!first) first = sg; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..f0380d4987b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3574,11 +3574,26 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } sdg->sgp->power = power; } -- cgit v1.2.3-70-g09d2 From 29baa7478ba47d746e3625c91d3b2afbf46b4312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Apr 2012 12:11:21 +0200 Subject: sched: Move nr_cpus_allowed out of 'struct sched_rt_entity' Since nr_cpus_allowed is used outside of sched/rt.c and wants to be used outside of there more, move it to a more natural site. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kr61f02y9brwzkh6x53pdptm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/process.c | 2 +- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 36 +++++++++++++++++++++--------------- 6 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 2e3994b20169..62bcea7dcc6d 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) unsigned long newsp; #ifdef __ARCH_SYNC_CORE_DCACHE - if (current->rt.nr_cpus_allowed == num_possible_cpus()) + if (current->nr_cpus_allowed == num_possible_cpus()) set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id())); #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e4baff5f7ff4..9e65eff6af3b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .se = { \ @@ -157,7 +158,6 @@ extern struct cred init_cred; .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = RR_TIMESLICE, \ - .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d61e5977e517..0f50e78f7f44 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1188,7 +1188,6 @@ struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned int time_slice; - int nr_cpus_allowed; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED @@ -1253,6 +1252,7 @@ struct task_struct { #endif unsigned int policy; + int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a69374fb427..70cc36a6073f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5015,7 +5015,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b449a762074..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..295da737b6fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); -- cgit v1.2.3-70-g09d2 From 1292531f6f27af909e713671dd9cc3bcab8114b7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:41:54 +0900 Subject: sched: Make sched_feat_names const The strings sched_feat_names are never changed. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29B2.9030904@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70cc36a6073f..c1679a098fc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,7 +142,7 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" NULL }; -- cgit v1.2.3-70-g09d2 From 7997a456ef841bb78eb6f881d7cc2c17c2f9b35e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 25 May 2012 15:42:47 +0900 Subject: sched: Remove the last NULL entry from sched_feat_names No need to have the last NULL entry. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FBF29E7.5020805@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1679a098fc7..94d598ac5e64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,6 @@ const_debug unsigned int sysctl_sched_features = static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT -- cgit v1.2.3-70-g09d2 From 6a4c96eef42f835734a82c6b512abf9881b7c55d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Wed, 23 May 2012 14:44:11 +0530 Subject: sched: Remove NULL assignment of dattr_cur Remove explicit NULL assignment of static pointer dattr_cur from init_sched_domains(). Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120523091411.GG5005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94d598ac5e64..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6726,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); -- cgit v1.2.3-70-g09d2