From 6168f8ed01dc46a277908938294f1132d723f58d Mon Sep 17 00:00:00 2001
From: Wei Jiangang
Date: Wed, 29 Jun 2016 12:51:50 +0800
Subject: timers/nohz: Fix several typos

Signed-off-by: Wei Jiangang
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: fenghua.yu@intel.com
Link: http://lkml.kernel.org/r/1467175910-2966-2-git-send-email-weijg.fnst@cn.fujitsu.com
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..6d83e9c4a302 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 	if (delta.tv64 < tick_period.tv64)
 		return;

-	/* Reevalute with jiffies_lock held */
+	/* Reevaluate with jiffies_lock held */
 	write_seqlock(&jiffies_lock);

 	delta = ktime_sub(now, last_jiffies_update);
@@ -117,7 +117,7 @@ static void tick_sched_do_timer(ktime_t now)
 	/*
 	 * Check if the do_timer duty was dropped. We don't care about
 	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themself to
+	 * into a long sleep. If two cpus happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
 	 * jiffies_lock.
 	 */
@@ -571,7 +571,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 * @last_update_time: variable to store update time in. Do not update
 *	counters if NULL.
 *
- * Return the cummulative idle time (since boot) for a given
+ * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
@@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 * @last_update_time: variable to store update time in. Do not update
 *	counters if NULL.
 *
- * Return the cummulative iowait time (since boot) for a given
+ * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
@@ -733,7 +733,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 * do_timer() never invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last. If this cpu
 	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferement value.
+	 * sleep time to the timekeeping max_deferment value.
 	 * Otherwise we can sleep as long as we want.
 	 */
 	delta = timekeeping_max_deferment();
--
cgit v1.2.3-70-g09d2

From 0de7611a1031f25b713fda7d36de44f17c2ed790 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 1 Jul 2016 12:42:35 +0200
Subject: timers/nohz: Capitalize 'CPU' consistently

While reviewing another patch I noticed that kernel/time/tick-sched.c
had a charmingly (confusingly, annoyingly) rich set of variants for
spelling 'CPU':

  cpu
  cpus
  CPU
  CPUs
  per CPU
  per-CPU
  per cpu

... sometimes these were mixed even within the same comment block!
Compress these variants down to a single consistent set of:

  CPU
  CPUs
  per-CPU

Cc: Frederic Weisbecker
Cc: Thomas Gleixner
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-sched.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6d83e9c4a302..db57d1ba73eb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
 #include

 /*
- * Per cpu nohz control structure
+ * Per-CPU nohz control structure
 */
 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
 #ifdef CONFIG_NO_HZ_COMMON
 	/*
 	 * Check if the do_timer duty was dropped. We don't care about
-	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themselves to
+	 * concurrency: This happens only when the CPU in charge went
+	 * into a long sleep. If two CPUs happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
 	 * jiffies_lock.
 	 */
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
 /*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
- * perf events, posix cpu timers, ...
+ * perf events, posix CPU timers, ...
 */
 void __tick_nohz_task_switch(void)
 {
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
- * value. We do this unconditionally on any cpu, as we don't know whether the
- * cpu, which has the update task assigned is in a long sleep.
+ * value. We do this unconditionally on any CPU, as we don't know whether the
+ * CPU, which has the update task assigned is in a long sleep.
 */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
 }

 /*
- * Updates the per cpu time idle statistics counters
+ * Updates the per-CPU time idle statistics counters
 */
 static void
 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,7 +566,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 }

 /**
- * get_cpu_idle_time_us - get the total idle time of a cpu
+ * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 *	counters if NULL.
@@ -607,7 +607,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

 /**
- * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 *	counters if NULL.
@@ -726,12 +726,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	}

 	/*
-	 * If this cpu is the one which updates jiffies, then give up
-	 * the assignment and let it be taken by the cpu which runs
-	 * the tick timer next, which might be this cpu as well. If we
+	 * If this CPU is the one which updates jiffies, then give up
+	 * the assignment and let it be taken by the CPU which runs
+	 * the tick timer next, which might be this CPU as well. If we
 	 * don't drop this here the jiffies might be stale and
 	 * do_timer() never invoked. Keep track of the fact that it
-	 * was the one which had the do_timer() duty last. If this cpu
+	 * was the one which had the do_timer() duty last. If this CPU
 	 * is the one which had the do_timer() duty last, we limit the
 	 * sleep time to the timekeeping max_deferment value.
 	 * Otherwise we can sleep as long as we want.
@@ -841,9 +841,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 {
 	/*
-	 * If this cpu is offline and it is the one which updates
+	 * If this CPU is offline and it is the one which updates
 	 * jiffies, then give up the assignment and let it be taken by
-	 * the cpu which runs the tick timer next. If we don't drop
+	 * the CPU which runs the tick timer next. If we don't drop
 	 * this here the jiffies might be stale and do_timer() never
 	 * invoked.
 	 */
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());

 	/*
-	* Update the idle state in the scheduler domain hierarchy
-	* when tick_nohz_stop_sched_tick() is called from the idle loop.
-	* State will be updated to busy during the first busy tick after
-	* exiting idle.
-	*/
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();

 	local_irq_disable();
@@ -1211,7 +1211,7 @@ void tick_setup_sched_timer(void)
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;

-	/* Get the next period (per cpu) */
+	/* Get the next period (per-CPU) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

 	/* Offset the tick to avert jiffies_lock contention. */
--
cgit v1.2.3-70-g09d2

From ff00673292bd42a3688b33de47252a6a3c3f424c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 4 Jul 2016 09:50:35 +0000
Subject: timers/nohz: Remove pointless tick_nohz_kick_tick() function

This was a failed attempt to optimize the timer expiry in idle, which
was disabled and never revisited. Remove the cruft.

Signed-off-by: Thomas Gleixner
Cc: Arjan van de Ven
Cc: Chris Mason
Cc: Eric Dumazet
Cc: Frederic Weisbecker
Cc: George Spelvin
Cc: Josh Triplett
Cc: Len Brown
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.431073782@linutronix.de
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-sched.c | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..69abc7bfe80f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(void)
 	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }

-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
-	/* Switch back to 2.6.27 behaviour */
-	ktime_t delta;
-
-	/*
-	 * Do not touch the tick device, when the next expiry is either
-	 * already reached or less/equal than the tick period.
-	 */
-	delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
-	if (delta.tv64 <= tick_period.tv64)
-		return;
-
-	tick_nohz_restart(ts, now);
-#endif
-}
-
 static inline void tick_nohz_irq_enter(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(void)
 	now = ktime_get();
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
-	if (ts->tick_stopped) {
+	if (ts->tick_stopped)
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(ts, now);
-	}
 }

 #else
--
cgit v1.2.3-70-g09d2

From a683f390b93f4d1292f849fc48d28e322046120f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 4 Jul 2016 09:50:36 +0000
Subject: timers: Forward the wheel clock whenever possible

The wheel clock is stale when a CPU goes into a long idle sleep. This
has the side effect that timers which are queued end up in the outer
wheel levels. That results in coarser granularity.

To solve this, we keep track of the idle state and forward the wheel
clock whenever possible.

Signed-off-by: Thomas Gleixner
Cc: Arjan van de Ven
Cc: Chris Mason
Cc: Eric Dumazet
Cc: Frederic Weisbecker
Cc: George Spelvin
Cc: Josh Triplett
Cc: Len Brown
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.512039360@linutronix.de
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-internal.h |   1 +
 kernel/time/tick-sched.c    |  12 +++++
 kernel/time/timer.c         | 128 ++++++++++++++++++++++++++++++++++++--------
 3 files changed, 120 insertions(+), 21 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fdd0a..f738251000fe 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);

 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69abc7bfe80f..5d81f9aa30d2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	delta = next_tick - basemono;
 	if (delta <= (u64)TICK_NSEC) {
 		tick.tv64 = 0;
+
+		/*
+		 * Tell the timer code that the base is not idle, i.e. undo
+		 * the effect of get_next_timer_interrupt():
+		 */
+		timer_clear_idle();
 		/*
 		 * We've not stopped the tick yet, and there's a timer in the
 		 * next period, so no point in stopping it either, bail.
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 	tick_do_update_jiffies64(now);
 	cpu_load_update_nohz_stop();

+	/*
+	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+	 * the clock forward checks in the enqueue path:
+	 */
+	timer_clear_idle();
+
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
 	/*

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 658051c97a3c..9339d71ee998 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -196,9 +196,11 @@ struct timer_base {
 	spinlock_t		lock;
 	struct timer_list	*running_timer;
 	unsigned long		clk;
+	unsigned long		next_expiry;
 	unsigned int		cpu;
 	bool			migration_enabled;
 	bool			nohz_active;
+	bool			is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -519,24 +521,37 @@ static void
 internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	__internal_add_timer(base, timer);

+	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+		return;
+
 	/*
-	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel. We are
-	 * protected against the other CPU fiddling with the timer by
-	 * holding the timer base lock. This also makes sure that a
-	 * CPU on the way to stop its tick can not evaluate the timer
-	 * wheel.
-	 *
-	 * Spare the IPI for deferrable timers on idle targets though.
-	 * The next busy ticks will take care of it. Except full dynticks
-	 * require special care against races with idle_cpu(), lets deal
-	 * with that later.
+	 * TODO: This wants some optimizing similar to the code below, but we
+	 * will do that when we switch from push to pull for deferrable timers.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
-		if (!(timer->flags & TIMER_DEFERRABLE) ||
-		    tick_nohz_full_cpu(base->cpu))
+	if (timer->flags & TIMER_DEFERRABLE) {
+		if (tick_nohz_full_cpu(base->cpu))
 			wake_up_nohz_cpu(base->cpu);
+		return;
 	}
+
+	/*
+	 * We might have to IPI the remote CPU if the base is idle and the
+	 * timer is not deferrable. If the other CPU is on the way to idle
+	 * then it can't set base->is_idle as we hold the base lock:
+	 */
+	if (!base->is_idle)
+		return;
+
+	/* Check whether this is the new first expiring timer: */
+	if (time_after_eq(timer->expires, base->next_expiry))
+		return;
+
+	/*
+	 * Set the next expiry time and kick the CPU so it can reevaluate the
+	 * wheel:
+	 */
+	base->next_expiry = timer->expires;
+	wake_up_nohz_cpu(base->cpu);
 }

 #ifdef CONFIG_TIMER_STATS
@@ -844,10 +859,11 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
 }

-static inline struct timer_base *get_target_base(struct timer_base *base,
-						 unsigned tflags)
+#ifdef CONFIG_NO_HZ_COMMON
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
 {
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
 		return get_timer_this_cpu_base(tflags);
 	return get_timer_cpu_base(tflags, get_nohz_timer_target());
@@ -856,6 +872,43 @@ static inline struct timer_base *get_target_base(struct timer_base *base,
 #endif
 }

+static inline void forward_timer_base(struct timer_base *base)
+{
+	/*
+	 * We only forward the base when it's idle and we have a delta between
+	 * base clock and jiffies.
+ */ + if (!base->is_idle || (long) (jiffies - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jiffies)) + base->clk = jiffies; + else + base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ + struct timer_base *target = __get_target_base(base, tflags); + + forward_timer_base(target); + return target; +} + /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself @@ -1417,16 +1470,49 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); - spin_unlock(&base->lock); + base->next_expiry = nextevt; + /* + * We have a fresh next event. Check whether we can forward the base: + */ + if (time_after(nextevt, jiffies)) + base->clk = jiffies; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; - if (time_before_eq(nextevt, basej)) + if (time_before_eq(nextevt, basej)) { expires = basem; - else + base->is_idle = false; + } else { expires = basem + (nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle: + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } + spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { @@ -1440,7 +1526,7 @@ static int collect_expired_timers(struct timer_base *base, /* * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time. + * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { /* The call site will increment clock! */ -- cgit v1.2.3-70-g09d2 From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001 From: Gaurav Jindal Date: Thu, 14 Jul 2016 12:04:20 +0000 Subject: tick/nohz: Optimize nohz idle enter tick_nohz_start_idle is called before checking whether the idle tick can be stopped. If the tick cannot be stopped, calling tick_nohz_start_idle() is pointless and just wasting CPU cycles. Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns true. A short one minute observation of the effect on ARM64 shows a reduction of calls by 1.5% thus optimizing the idle entry sequence. 
From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001
From: Gaurav Jindal
Date: Thu, 14 Jul 2016 12:04:20 +0000
Subject: tick/nohz: Optimize nohz idle enter

tick_nohz_start_idle() is called before checking whether the idle tick
can be stopped. If the tick cannot be stopped, calling
tick_nohz_start_idle() is pointless and just wasting CPU cycles.

Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns
true. A short one minute observation of the effect on ARM64 shows a
reduction of calls by 1.5%, thus optimizing the idle entry sequence.

[tglx: Massaged changelog]

Co-developed-by: Sanjeev Yadav
Signed-off-by: Gaurav Jindal
Link: http://lkml.kernel.org/r/20160714120416.GB21099@gaurav.jindal@spreadtrum.com
Signed-off-by: Thomas Gleixner
---
 kernel/time/tick-sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2ec7c00228f3..204fdc86863d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();

-	now = tick_nohz_start_idle(ts);
-
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;

+		now = tick_nohz_start_idle(ts);
 		ts->idle_calls++;

 		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
--
cgit v1.2.3-70-g09d2
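To make the saving concrete, here is a small self-contained sketch, not kernel code, that counts how often the comparatively expensive tick_nohz_start_idle()/ktime_get() step would run before and after the reordering. The can_stop_idle_tick_sim() stand-in and its 1-in-64 failure rate are invented for illustration; the real decision depends on the conditions checked in can_stop_idle_tick().

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy illustration of the reordering above. can_stop_idle_tick_sim() is
 * a made-up stand-in for can_stop_idle_tick(); the failure rate is an
 * arbitrary assumption, not a measured kernel figure.
 */
static bool can_stop_idle_tick_sim(int i)
{
	return (i % 64) != 0;	/* pretend the tick occasionally cannot be stopped */
}

int main(void)
{
	int idle_entries = 100000;
	int calls_old = 0, calls_new = 0;

	for (int i = 0; i < idle_entries; i++) {
		calls_old++;			/* old order: start_idle runs unconditionally */
		if (can_stop_idle_tick_sim(i))
			calls_new++;		/* new order: only when the tick will be stopped */
	}

	printf("tick_nohz_start_idle() calls, old order: %d\n", calls_old);
	printf("tick_nohz_start_idle() calls, new order: %d\n", calls_new);
	return 0;
}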