diff options
author | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2016-12-12 20:46:15 +0100 |
---|---|---|
committer | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2016-12-12 20:46:15 +0100 |
commit | b19ad3b9f1fd46bb7d4ac623a4f2cb46fa2cb7a0 (patch) | |
tree | 31c2d63b2c7b516d35478597aa3dcbbe9b044357 /drivers | |
parent | fecc8c0ebd30c41cc66303b6f9476481c5d6d260 (diff) | |
parent | 404ea9f1a792eebdc208d53fd38bdd1619531430 (diff) |
Merge branch 'pm-cpuidle'
* pm-cpuidle:
cpuidle: Add a kerneldoc comment to cpuidle_use_deepest_state()
cpuidle: fix improper return value on error
intel_idle: Convert to hotplug state machine
intel_idle: Remove superfluous SMP function call
MAINTAINERS: Add Jacob Pan as a new intel_idle maintainer
MAINTAINERS: Add bug tracking system location entries for cpuidle
x86/intel_idle: Add Knights Mill CPUID
x86/intel_idle: Add CPU model 0x4a (Atom Z34xx series)
thermal/intel_powerclamp: stop sched tick in forced idle
thermal/intel_powerclamp: Convert to CPU hotplug state
thermal/intel_powerclamp: Convert the kthread to kthread worker API
thermal/intel_powerclamp: Remove duplicated code that starts the kthread
sched/idle: Add support for tasks that inject idle
cpuidle: Allow enforcing deepest idle state selection
cpuidle/powernv: staticise powernv_idle_driver
cpuidle: dt: assign ->enter_freeze to same as ->enter callback function
cpuidle: governors: Remove remaining old module code
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/cpuidle/cpuidle-powernv.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle.c | 19 | ||||
-rw-r--r-- | drivers/cpuidle/dt_idle_states.c | 6 | ||||
-rw-r--r-- | drivers/cpuidle/governor.c | 4 | ||||
-rw-r--r-- | drivers/cpuidle/governors/ladder.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/sysfs.c | 4 | ||||
-rw-r--r-- | drivers/idle/intel_idle.c | 154 | ||||
-rw-r--r-- | drivers/thermal/intel_powerclamp.c | 359 |
9 files changed, 301 insertions, 251 deletions
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442ca38f4..0835a37a5f3a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..62810ff3b00f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/** + * cpuidle_use_deepest_state - Set/clear governor override flag. + * @enable: New value of the flag. + * + * Set/unset the current CPU to use the deepest idle state (override governors + * going forward if set). + */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b67f37..ffca4fc0061d 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. 
So enter() can be also enter_freeze() callback. + */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511cca23..4e78263e34a4 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include <linux/mutex.h> -#include <linux/module.h> #include <linux/cpuidle.h> #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a403e22..fe8f08948fcb 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> #include <linux/pm_qos.h> -#include <linux/module.h> #include <linux/jiffies.h> #include <linux/tick.h> @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c291de6..d9b5b9398a0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include <linux/tick.h> #include <linux/sched.h> #include <linux/math64.h> -#include <linux/module.h> /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = 
menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 832a2c3f01ff..c5adc8c9ac43 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) /* state statistics */ for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); - if (!kobj) + if (!kobj) { + ret = -ENOMEM; goto error_state; + } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; init_completion(&kobj->kobj_unregister); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4466a2f969d7..7d8ea3d5fda6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static void intel_idle_freeze(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static int intel_idle_cpu_init(int cpu); - static struct cpuidle_state *cpuidle_state_table; /* @@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state tangier_cstates[] = { + { + .name = "C1-TNG", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 4, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-TNG", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-TNG", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-TNG", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + 
.exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-TNG", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; static struct cpuidle_state avn_cstates[] = { { .name = "C1-AVN", @@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static void __setup_broadcast_timer(void *arg) +static void __setup_broadcast_timer(bool on) { - unsigned long on = (unsigned long)arg; - if (on) tick_broadcast_enable(); else tick_broadcast_disable(); } -static int cpu_hotplug_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)true, 1); - - /* - * Some systems can hotplug a cpu at runtime after - * the kernel has booted, we have to initialize the - * driver in this case - */ - dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (dev->registered) - break; - - if (intel_idle_cpu_init(hotcpu)) - return NOTIFY_BAD; - - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_hotplug_notifier = { - .notifier_call = cpu_hotplug_notify, -}; - -static void auto_demotion_disable(void *dummy) +static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } -static void c1e_promotion_disable(void *dummy) +static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -978,6 +984,10 @@ static const struct 
idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; +static const struct idle_cpu idle_cpu_tangier = { + .state_table = tangier_cstates, +}; + static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, @@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), @@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), {} @@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void) * allocate, initialize, register cpuidle_devices * @cpu: cpu/core to initialize */ -static int intel_idle_cpu_init(int cpu) +static int intel_idle_cpu_init(unsigned int cpu) { struct cpuidle_device *dev; dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); - dev->cpu = cpu; if (cpuidle_register_device(dev)) { @@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu) } if (icpu->auto_demotion_disable_flags) - smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + auto_demotion_disable(); if (icpu->disable_promotion_to_c1e) - smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + c1e_promotion_disable(); + + return 0; +} + +static int intel_idle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + __setup_broadcast_timer(true); + + /* + * Some 
systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + if (!dev->registered) + return intel_idle_cpu_init(cpu); return 0; } static int __init intel_idle_init(void) { - int retval, i; + int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) @@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void) struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); - free_percpu(intel_idle_cpuidle_devices); - return retval; + goto init_driver_fail; } - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - retval = intel_idle_cpu_init(i); - if (retval) { - intel_idle_cpuidle_devices_uninit(); - cpu_notifier_register_done(); - cpuidle_unregister_driver(&intel_idle_driver); - free_percpu(intel_idle_cpuidle_devices); - return retval; - } - } - __register_cpu_notifier(&cpu_hotplug_notifier); - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - cpu_notifier_register_done(); + retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", + intel_idle_cpu_online, NULL); + if (retval < 0) + goto hp_setup_fail; pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", lapic_timer_reliable_states); return 0; + +hp_setup_fail: + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); +init_driver_fail: + free_percpu(intel_idle_cpuidle_devices); + return retval; + } device_initcall(intel_idle_init); diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index afada655f861..83e697186410 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -43,7 +43,6 @@ #include 
<linux/kernel.h> #include <linux/delay.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/cpu.h> #include <linux/thermal.h> #include <linux/slab.h> @@ -86,11 +85,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; +static const struct sched_param sparam = { + .sched_priority = MAX_USER_RT_PRIO / 2, +}; +struct powerclamp_worker_data { + struct kthread_worker *worker; + struct kthread_work balancing_work; + struct kthread_delayed_work idle_injection_work; + unsigned int cpu; + unsigned int count; + unsigned int guard; + unsigned int window_size_now; + unsigned int target_ratio; + unsigned int duration_jiffies; + bool clamping; +}; -static struct task_struct * __percpu *powerclamp_thread; +static struct powerclamp_worker_data * __percpu worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping thread + * clamping kthread worker */ static unsigned int duration; @@ -262,11 +276,6 @@ static u64 pkg_state_counter(void) return count; } -static void noop_timer(unsigned long foo) -{ - /* empty... 
just the fact that we get the interrupt wakes us up */ -} - static unsigned int get_compensation(int ratio) { unsigned int comp = 0; @@ -368,103 +377,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, return set_target_ratio + guard <= current_ratio; } -static int clamp_thread(void *arg) +static void clamp_balancing_func(struct kthread_work *work) { - int cpunr = (unsigned long)arg; - DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - unsigned int count = 0; - unsigned int target_ratio; + struct powerclamp_worker_data *w_data; + int sleeptime; + unsigned long target_jiffies; + unsigned int compensated_ratio; + int interval; /* jiffies to sleep for each attempt */ - set_bit(cpunr, cpu_clamping_mask); - set_freezable(); - init_timer_on_stack(&wakeup_timer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - - while (true == clamping && !kthread_should_stop() && - cpu_online(cpunr)) { - int sleeptime; - unsigned long target_jiffies; - unsigned int guard; - unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - unsigned int duration_jiffies = msecs_to_jiffies(duration); - unsigned int window_size_now; - - try_to_freeze(); - /* - * make sure user selected ratio does not take effect until - * the next round. adjust target_ratio if user has changed - * target such that we can converge quickly. - */ - target_ratio = set_target_ratio; - guard = 1 + target_ratio/20; - window_size_now = window_size; - count++; - - /* - * systems may have different ability to enter package level - * c-states, thus we need to compensate the injected idle ratio - * to achieve the actual target reported by the HW. 
- */ - compensated_ratio = target_ratio + - get_compensation(target_ratio); - if (compensated_ratio <= 0) - compensated_ratio = 1; - interval = duration_jiffies * 100 / compensated_ratio; - - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - schedule_timeout_interruptible(sleeptime); - /* - * only elected controlling cpu can collect stats and update - * control parameters. - */ - if (cpunr == control_cpu && !(count%window_size_now)) { - should_skip = - powerclamp_adjust_controls(target_ratio, - guard, window_size_now); - smp_mb(); - } + w_data = container_of(work, struct powerclamp_worker_data, + balancing_work); - if (should_skip) - continue; - - target_jiffies = jiffies + duration_jiffies; - mod_timer(&wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - continue; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; - - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + /* + * make sure user selected ratio does not take effect until + * the next round. adjust target_ratio if user has changed + * target such that we can converge quickly. 
+ */ + w_data->target_ratio = READ_ONCE(set_target_ratio); + w_data->guard = 1 + w_data->target_ratio / 20; + w_data->window_size_now = window_size; + w_data->duration_jiffies = msecs_to_jiffies(duration); + w_data->count++; + + /* + * systems may have different ability to enter package level + * c-states, thus we need to compensate the injected idle ratio + * to achieve the actual target reported by the HW. + */ + compensated_ratio = w_data->target_ratio + + get_compensation(w_data->target_ratio); + if (compensated_ratio <= 0) + compensated_ratio = 1; + interval = w_data->duration_jiffies * 100 / compensated_ratio; + + /* align idle time */ + target_jiffies = roundup(jiffies, interval); + sleeptime = target_jiffies - jiffies; + if (sleeptime <= 0) + sleeptime = 1; + + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_delayed_work(w_data->worker, + &w_data->idle_injection_work, + sleeptime); +} + +static void clamp_idle_injection_func(struct kthread_work *work) +{ + struct powerclamp_worker_data *w_data; + + w_data = container_of(work, struct powerclamp_worker_data, + idle_injection_work.work); + + /* + * only elected controlling cpu can collect stats and update + * control parameters. 
+ */ + if (w_data->cpu == control_cpu && + !(w_data->count % w_data->window_size_now)) { + should_skip = + powerclamp_adjust_controls(w_data->target_ratio, + w_data->guard, + w_data->window_size_now); + smp_mb(); } - del_timer_sync(&wakeup_timer); - clear_bit(cpunr, cpu_clamping_mask); - return 0; + if (should_skip) + goto balance; + + play_idle(jiffies_to_msecs(w_data->duration_jiffies)); + +balance: + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_work(w_data->worker, &w_data->balancing_work); } /* @@ -508,10 +493,60 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } +static void start_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); + if (IS_ERR(worker)) + return; + + w_data->worker = worker; + w_data->count = 0; + w_data->cpu = cpu; + w_data->clamping = true; + set_bit(cpu, cpu_clamping_mask); + sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + kthread_init_work(&w_data->balancing_work, clamp_balancing_func); + kthread_init_delayed_work(&w_data->idle_injection_work, + clamp_idle_injection_func); + kthread_queue_work(w_data->worker, &w_data->balancing_work); +} + +static void stop_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + + if (!w_data->worker) + return; + + w_data->clamping = false; + /* + * Make sure that all works that get queued after this point see + * the clamping disabled. The counter part is not needed because + * there is an implicit memory barrier when the queued work + * is proceed. 
+ */ + smp_wmb(); + kthread_cancel_work_sync(&w_data->balancing_work); + kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); + /* + * The balancing work still might be queued here because + * the handling of the "clapming" variable, cancel, and queue + * operations are not synchronized via a lock. But it is not + * a big deal. The balancing work is fast and destroy kthread + * will wait for it. + */ + clear_bit(w_data->cpu, cpu_clamping_mask); + kthread_destroy_worker(w_data->worker); + + w_data->worker = NULL; +} + static int start_power_clamp(void) { unsigned long cpu; - struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ @@ -525,22 +560,9 @@ static int start_power_clamp(void) clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); - /* start one thread per online cpu */ + /* start one kthread worker per online cpu */ for_each_online_cpu(cpu) { - struct task_struct **p = - per_cpu_ptr(powerclamp_thread, cpu); - - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - /* bind to cpu here */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; - } - + start_power_clamp_worker(cpu); } put_online_cpus(); @@ -550,71 +572,49 @@ static int start_power_clamp(void) static void end_power_clamp(void) { int i; - struct task_struct *thread; - clamping = false; /* - * make clamping visible to other cpus and give per cpu clamping threads - * sometime to exit, or gets killed later. + * Block requeuing in all the kthread workers. They will flush and + * stop faster. 
*/ - smp_mb(); - msleep(20); + clamping = false; if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping thread for cpu %d alive, kill\n", i); - thread = *per_cpu_ptr(powerclamp_thread, i); - kthread_stop(thread); + pr_debug("clamping worker for cpu %d alive, destroy\n", + i); + stop_power_clamp_worker(i); } } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - struct task_struct *thread; - struct task_struct **percpu_thread = - per_cpu_ptr(powerclamp_thread, cpu); - - if (false == clamping) - goto exit_ok; - - switch (action) { - case CPU_ONLINE: - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%lu", cpu); - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *percpu_thread = thread; - } - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - kthread_stop(*percpu_thread); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); } - -exit_ok: - return NOTIFY_OK; + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; + + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; + + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; +} 
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -742,6 +742,8 @@ file_error: debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -759,10 +761,17 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; - powerclamp_thread = alloc_percpu(struct task_struct *); - if (!powerclamp_thread) { + worker_data = alloc_percpu(struct powerclamp_worker_data); + if (!worker_data) { retval = -ENOMEM; goto exit_unregister; } @@ -782,9 +791,9 @@ static int __init powerclamp_init(void) return 0; exit_free_thread: - free_percpu(powerclamp_thread); + free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -793,9 +802,9 @@ module_init(powerclamp_init); static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); - free_percpu(powerclamp_thread); + cpuhp_remove_state_nocalls(hp_state); + free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask); |