39 files changed, 913 insertions, 503 deletions
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index cbea78e79f3d..ac4dde8fdb8b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -32,7 +32,6 @@ #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> -#include <linux/cpuidle.h> #include <linux/devfreq.h> #include <linux/timer.h> @@ -747,8 +746,6 @@ void dpm_resume_noirq(pm_message_t state) resume_device_irqs(); device_wakeup_disarm_wake_irqs(); - - cpuidle_resume(); } /** @@ -1051,7 +1048,7 @@ static void device_complete(struct device *dev, pm_message_t state) const char *info = NULL; if (dev->power.syscore) - return; + goto out; device_lock(dev); @@ -1081,6 +1078,7 @@ static void device_complete(struct device *dev, pm_message_t state) device_unlock(dev); +out: pm_runtime_put(dev); } @@ -1336,8 +1334,6 @@ int dpm_suspend_noirq(pm_message_t state) { int ret; - cpuidle_pause(); - device_wakeup_arm_wake_irqs(); suspend_device_irqs(); @@ -1794,9 +1790,6 @@ static int device_prepare(struct device *dev, pm_message_t state) int (*callback)(struct device *) = NULL; int ret = 0; - if (dev->power.syscore) - return 0; - /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we @@ -1805,6 +1798,9 @@ static int device_prepare(struct device *dev, pm_message_t state) */ pm_runtime_get_noresume(dev); + if (dev->power.syscore) + return 0; + device_lock(dev); dev->power.wakeup_path = false; diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 54292cdd7808..0eb7f02b3ad5 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -25,8 +25,10 @@ extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) +#define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ - WAKE_IRQ_DEDICATED_MANAGED) + WAKE_IRQ_DEDICATED_MANAGED | \ + WAKE_IRQ_DEDICATED_REVERSE) struct wake_irq { struct device *dev; @@ -39,7 +41,8 @@ extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); -extern void dev_pm_disable_wake_irq_check(struct device *dev); +extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); +extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index ec94049442b9..d504cd4ab3cb 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -645,6 +645,8 @@ static int rpm_suspend(struct device *dev, int rpmflags) if (retval) goto fail; + dev_pm_enable_wake_irq_complete(dev); + no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); @@ -690,7 +692,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) return retval; fail: - dev_pm_disable_wake_irq_check(dev); + dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); @@ -873,7 +875,7 @@ static int rpm_resume(struct device *dev, int rpmflags) callback = RPM_GET_CALLBACK(dev, runtime_resume); - dev_pm_disable_wake_irq_check(dev); + dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); diff 
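The device_prepare()/device_complete() hunks above are a matched pair: the runtime PM reference is now taken before the syscore early return, so device_complete() can drop it unconditionally through the new out label. A condensed sketch of the resulting invariant, with the callback dispatch elided:

    static int device_prepare(struct device *dev, pm_message_t state)
    {
            /* Taken first for every device, including syscore ones. */
            pm_runtime_get_noresume(dev);

            if (dev->power.syscore)
                    return 0;       /* reference intentionally still held */

            /* ... run the ->prepare() callback ... */
            return 0;
    }

    static void device_complete(struct device *dev, pm_message_t state)
    {
            if (dev->power.syscore)
                    goto out;

            /* ... run the ->complete() callback ... */
    out:
            pm_runtime_put(dev);    /* balances device_prepare() in every path */
    }
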
--git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index b91a3a9bf9f6..0004db4a9d3b 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -142,24 +142,7 @@ static irqreturn_t handle_threaded_wake_irq(int irq, void *_wirq) return IRQ_HANDLED; } -/** - * dev_pm_set_dedicated_wake_irq - Request a dedicated wake-up interrupt - * @dev: Device entry - * @irq: Device wake-up interrupt - * - * Unless your hardware has separate wake-up interrupts in addition - * to the device IO interrupts, you don't need this. - * - * Sets up a threaded interrupt handler for a device that has - * a dedicated wake-up interrupt in addition to the device IO - * interrupt. - * - * The interrupt starts disabled, and needs to be managed for - * the device by the bus code or the device driver using - * dev_pm_enable_wake_irq() and dev_pm_disable_wake_irq() - * functions. - */ -int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) +static int __dev_pm_set_dedicated_wake_irq(struct device *dev, int irq, unsigned int flag) { struct wake_irq *wirq; int err; @@ -197,7 +180,7 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) if (err) goto err_free_irq; - wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED; + wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED | flag; return err; @@ -210,9 +193,58 @@ err_free: return err; } + + +/** + * dev_pm_set_dedicated_wake_irq - Request a dedicated wake-up interrupt + * @dev: Device entry + * @irq: Device wake-up interrupt + * + * Unless your hardware has separate wake-up interrupts in addition + * to the device IO interrupts, you don't need this. + * + * Sets up a threaded interrupt handler for a device that has + * a dedicated wake-up interrupt in addition to the device IO + * interrupt. + * + * The interrupt starts disabled, and needs to be managed for + * the device by the bus code or the device driver using + * dev_pm_enable_wake_irq*() and dev_pm_disable_wake_irq*() + * functions. + */ +int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) +{ + return __dev_pm_set_dedicated_wake_irq(dev, irq, 0); +} EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq); /** + * dev_pm_set_dedicated_wake_irq_reverse - Request a dedicated wake-up interrupt + * with reverse enable ordering + * @dev: Device entry + * @irq: Device wake-up interrupt + * + * Unless your hardware has separate wake-up interrupts in addition + * to the device IO interrupts, you don't need this. + * + * Sets up a threaded interrupt handler for a device that has a dedicated + * wake-up interrupt in addition to the device IO interrupt. It sets + * the status of WAKE_IRQ_DEDICATED_REVERSE to tell rpm_suspend() + * to enable dedicated wake-up interrupt after running the runtime suspend + * callback for @dev. + * + * The interrupt starts disabled, and needs to be managed for + * the device by the bus code or the device driver using + * dev_pm_enable_wake_irq*() and dev_pm_disable_wake_irq*() + * functions. 
+ */ +int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq) +{ + return __dev_pm_set_dedicated_wake_irq(dev, irq, WAKE_IRQ_DEDICATED_REVERSE); +} +EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq_reverse); + +/** * dev_pm_enable_wake_irq - Enable device wake-up interrupt * @dev: Device * @@ -282,28 +314,55 @@ void dev_pm_enable_wake_irq_check(struct device *dev, return; enable: - enable_irq(wirq->irq); + if (!can_change_status || !(wirq->status & WAKE_IRQ_DEDICATED_REVERSE)) + enable_irq(wirq->irq); } /** * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt * @dev: Device + * @cond_disable: if set, also check WAKE_IRQ_DEDICATED_REVERSE * * Disables wake-up interrupt conditionally based on status. * Should be only called from rpm_suspend() and rpm_resume() path. */ -void dev_pm_disable_wake_irq_check(struct device *dev) +void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable) { struct wake_irq *wirq = dev->power.wakeirq; if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) return; + if (cond_disable && (wirq->status & WAKE_IRQ_DEDICATED_REVERSE)) + return; + if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED) disable_irq_nosync(wirq->irq); } /** + * dev_pm_enable_wake_irq_complete - enable wake IRQ not enabled before + * @dev: Device using the wake IRQ + * + * Enable wake IRQ conditionally based on status, mainly used if want to + * enable wake IRQ after running ->runtime_suspend() which depends on + * WAKE_IRQ_DEDICATED_REVERSE. + * + * Should be only called from rpm_suspend() path. + */ +void dev_pm_enable_wake_irq_complete(struct device *dev) +{ + struct wake_irq *wirq = dev->power.wakeirq; + + if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) + return; + + if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED && + wirq->status & WAKE_IRQ_DEDICATED_REVERSE) + enable_irq(wirq->irq); +} + +/** * dev_pm_arm_wake_irq - Arm device wake-up * @wirq: Device wake-up interrupt * diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 28467d83c745..3d514b82d055 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -470,7 +470,8 @@ static unsigned int acpi_cpufreq_fast_switch(struct cpufreq_policy *policy, if (policy->cached_target_freq == target_freq) index = policy->cached_resolved_idx; else - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, + false); entry = &policy->freq_table[index]; next_freq = entry->frequency; diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index d0b10baf039a..6448e03bcf48 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -91,7 +91,8 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, unsigned int index; index = cpufreq_table_find_index_h(policy, - policy->cur - 1); + policy->cur - 1, + relation & CPUFREQ_RELATION_E); freq_next = policy->freq_table[index].frequency; } diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d4c27022b9c9..db17196266e4 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -741,8 +741,6 @@ static int __init cppc_cpufreq_init(void) if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - INIT_LIST_HEAD(&cpu_data_list); - cppc_check_hisi_workaround(); cppc_freq_invariance_init(); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5782b15a8caa..e338d2f010fe 100644 --- 
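As a driver-side usage sketch (the device, IRQ name, and probe flow below are illustrative assumptions, not taken from this patch): hardware whose dedicated wake-up interrupt could fire as soon as it is unmasked, and which therefore must only have it enabled after ->runtime_suspend() has quiesced the device, requests the reverse-ordered variant at probe time:

    #include <linux/platform_device.h>
    #include <linux/pm_runtime.h>
    #include <linux/pm_wakeirq.h>

    static int foo_probe(struct platform_device *pdev)
    {
            struct device *dev = &pdev->dev;
            int wakeup_irq, ret;

            wakeup_irq = platform_get_irq_byname(pdev, "wakeup");
            if (wakeup_irq < 0)
                    return wakeup_irq;

            /*
             * WAKE_IRQ_DEDICATED_REVERSE: rpm_suspend() enables this IRQ
             * via dev_pm_enable_wake_irq_complete() only after the
             * ->runtime_suspend() callback has returned successfully.
             */
            ret = dev_pm_set_dedicated_wake_irq_reverse(dev, wakeup_irq);
            if (ret)
                    return ret;

            pm_runtime_enable(dev);
            return 0;
    }

The xhci-mtk and mtu3 conversions near the end of this diff are the two in-tree users of exactly this pattern.
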
a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -554,7 +554,7 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq) { - return __resolve_freq(policy, target_freq, CPUFREQ_RELATION_L); + return __resolve_freq(policy, target_freq, CPUFREQ_RELATION_LE); } EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); @@ -2260,8 +2260,16 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, !(cpufreq_driver->flags & CPUFREQ_NEED_UPDATE_LIMITS)) return 0; - if (cpufreq_driver->target) + if (cpufreq_driver->target) { + /* + * If the driver hasn't setup a single inefficient frequency, + * it's unlikely it knows how to decode CPUFREQ_RELATION_E. + */ + if (!policy->efficiencies_available) + relation &= ~CPUFREQ_RELATION_E; + return cpufreq_driver->target(policy, target_freq, relation); + } if (!cpufreq_driver->target_index) return -EINVAL; @@ -2523,8 +2531,15 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (ret) return ret; + /* + * Resolve policy min/max to available frequencies. It ensures + * no frequency resolution will neither overshoot the requested maximum + * nor undershoot the requested minimum. + */ policy->min = new_data.min; policy->max = new_data.max; + policy->min = __resolve_freq(policy, policy->min, CPUFREQ_RELATION_L); + policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H); trace_cpu_frequency_limits(policy); policy->cached_target_freq = UINT_MAX; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index aa39ff31ec9f..0879ec3c170c 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -111,7 +111,8 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy) if (requested_freq > policy->max) requested_freq = policy->max; - __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_H); + __cpufreq_driver_target(policy, requested_freq, + CPUFREQ_RELATION_HE); dbs_info->requested_freq = requested_freq; goto out; } @@ -134,7 +135,8 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy) else requested_freq = policy->min; - __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_L); + __cpufreq_driver_target(policy, requested_freq, + CPUFREQ_RELATION_LE); dbs_info->requested_freq = requested_freq; } diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index eb4320b619c9..3b8f924771b4 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -83,9 +83,11 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, freq_avg = freq_req - freq_reduc; /* Find freq bounds for freq_avg in freq_table */ - index = cpufreq_table_find_index_h(policy, freq_avg); + index = cpufreq_table_find_index_h(policy, freq_avg, + relation & CPUFREQ_RELATION_E); freq_lo = freq_table[index].frequency; - index = cpufreq_table_find_index_l(policy, freq_avg); + index = cpufreq_table_find_index_l(policy, freq_avg, + relation & CPUFREQ_RELATION_E); freq_hi = freq_table[index].frequency; /* Find out how long we have to be in hi and lo freqs */ @@ -118,12 +120,12 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) if (od_tuners->powersave_bias) freq = od_ops.powersave_bias_target(policy, freq, - CPUFREQ_RELATION_H); + CPUFREQ_RELATION_HE); else if (policy->cur == policy->max) return; __cpufreq_driver_target(policy, freq, 
od_tuners->powersave_bias ? - CPUFREQ_RELATION_L : CPUFREQ_RELATION_H); + CPUFREQ_RELATION_LE : CPUFREQ_RELATION_HE); } /* @@ -161,9 +163,9 @@ static void od_update(struct cpufreq_policy *policy) if (od_tuners->powersave_bias) freq_next = od_ops.powersave_bias_target(policy, freq_next, - CPUFREQ_RELATION_L); + CPUFREQ_RELATION_LE); - __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C); + __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_CE); } } @@ -182,7 +184,7 @@ static unsigned int od_dbs_update(struct cpufreq_policy *policy) */ if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) { __cpufreq_driver_target(policy, dbs_info->freq_lo, - CPUFREQ_RELATION_H); + CPUFREQ_RELATION_HE); return dbs_info->freq_lo_delay_us; } diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 8c176b7dae41..349ddbaef796 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -32,6 +32,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/intel-family.h> +#include "../drivers/thermal/intel/thermal_interrupt.h" #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) @@ -219,6 +220,7 @@ struct global_params { * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance * @suspended: Whether or not the driver has been suspended. + * @hwp_notify_work: workqueue for HWP notifications. * * This structure stores per CPU instance data for all CPUs. */ @@ -257,6 +259,7 @@ struct cpudata { unsigned int sched_flags; u32 hwp_boost_min; bool suspended; + struct delayed_work hwp_notify_work; }; static struct cpudata **all_cpu_data; @@ -537,7 +540,8 @@ static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu) * scaling factor is too high, so recompute it to make the HWP_CAP * highest performance correspond to the maximum turbo frequency. 
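CPUFREQ_RELATION_E is an OR-able flag on top of the existing L/H/C relations rather than a fourth relation, which keeps old drivers working: __cpufreq_driver_target() masks it off for ->target() drivers without inefficiency data, and the table lookups receive it as the new bool argument. A hypothetical ->target() implementation honoring the hint could decode it as:

    static int foo_target(struct cpufreq_policy *policy,
                          unsigned int target_freq, unsigned int relation)
    {
            bool efficiencies = relation & CPUFREQ_RELATION_E;
            int idx;

            switch (relation & ~CPUFREQ_RELATION_E) {
            case CPUFREQ_RELATION_H:
                    idx = cpufreq_table_find_index_h(policy, target_freq,
                                                     efficiencies);
                    break;
            case CPUFREQ_RELATION_C:
                    idx = cpufreq_table_find_index_c(policy, target_freq,
                                                     efficiencies);
                    break;
            default:        /* CPUFREQ_RELATION_L */
                    idx = cpufreq_table_find_index_l(policy, target_freq,
                                                     efficiencies);
                    break;
            }

            /* ... program policy->freq_table[idx].frequency into hardware ... */
            return 0;
    }
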
*/ - if (turbo_freq < cpu->pstate.turbo_pstate * scaling) { + cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling; + if (turbo_freq < cpu->pstate.turbo_freq) { cpu->pstate.turbo_freq = turbo_freq; scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate); cpu->pstate.scaling = scaling; @@ -985,11 +989,15 @@ skip_epp: wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } +static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata); + static void intel_pstate_hwp_offline(struct cpudata *cpu) { u64 value = READ_ONCE(cpu->hwp_req_cached); int min_perf; + intel_pstate_disable_hwp_interrupt(cpu); + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { /* * In case the EPP has been set to "performance" by the @@ -1053,6 +1061,9 @@ static int intel_pstate_suspend(struct cpufreq_policy *policy) cpu->suspended = true; + /* disable HWP interrupt and cancel any pending work */ + intel_pstate_disable_hwp_interrupt(cpu); + return 0; } @@ -1546,15 +1557,105 @@ static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) /************************** sysfs end ************************/ +static void intel_pstate_notify_work(struct work_struct *work) +{ + struct cpudata *cpudata = + container_of(to_delayed_work(work), struct cpudata, hwp_notify_work); + + cpufreq_update_policy(cpudata->cpu); + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0); +} + +static DEFINE_SPINLOCK(hwp_notify_lock); +static cpumask_t hwp_intr_enable_mask; + +void notify_hwp_interrupt(void) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpudata *cpudata; + unsigned long flags; + u64 value; + + if (!READ_ONCE(hwp_active) || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) + return; + + rdmsrl_safe(MSR_HWP_STATUS, &value); + if (!(value & 0x01)) + return; + + spin_lock_irqsave(&hwp_notify_lock, flags); + + if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) + goto ack_intr; + + /* + * Currently we never free all_cpu_data. And we can't reach here + * without this allocated. But for safety for future changes, added + * check. + */ + if (unlikely(!READ_ONCE(all_cpu_data))) + goto ack_intr; + + /* + * The free is done during cleanup, when cpufreq registry is failed. + * We wouldn't be here if it fails on init or switch status. But for + * future changes, added check. 
+ */ + cpudata = READ_ONCE(all_cpu_data[this_cpu]); + if (unlikely(!cpudata)) + goto ack_intr; + + schedule_delayed_work(&cpudata->hwp_notify_work, msecs_to_jiffies(10)); + + spin_unlock_irqrestore(&hwp_notify_lock, flags); + + return; + +ack_intr: + wrmsrl_safe(MSR_HWP_STATUS, 0); + spin_unlock_irqrestore(&hwp_notify_lock, flags); +} + +static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata) +{ + unsigned long flags; + + /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */ + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); + + spin_lock_irqsave(&hwp_notify_lock, flags); + if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask)) + cancel_delayed_work(&cpudata->hwp_notify_work); + spin_unlock_irqrestore(&hwp_notify_lock, flags); +} + +static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata) +{ + /* Enable HWP notification interrupt for guaranteed performance change */ + if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) { + unsigned long flags; + + spin_lock_irqsave(&hwp_notify_lock, flags); + INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work); + cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask); + spin_unlock_irqrestore(&hwp_notify_lock, flags); + + /* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */ + wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01); + } +} + static void intel_pstate_hwp_enable(struct cpudata *cpudata) { - /* First disable HWP notification interrupt as we don't process them */ + /* First disable HWP notification interrupt till we activate again */ if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); if (cpudata->epp_default == -EINVAL) cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); + + intel_pstate_enable_hwp_interrupt(cpudata); } static int atom_get_min_pstate(void) @@ -2266,7 +2367,7 @@ static int intel_pstate_init_cpu(unsigned int cpunum) if (!cpu) return -ENOMEM; - all_cpu_data[cpunum] = cpu; + WRITE_ONCE(all_cpu_data[cpunum], cpu); cpu->cpu = cpunum; @@ -2929,8 +3030,10 @@ static void intel_pstate_driver_cleanup(void) if (intel_pstate_driver == &intel_pstate) intel_pstate_clear_update_util_hook(cpu); + spin_lock(&hwp_notify_lock); kfree(all_cpu_data[cpu]); - all_cpu_data[cpu] = NULL; + WRITE_ONCE(all_cpu_data[cpu], NULL); + spin_unlock(&hwp_notify_lock); } } cpus_read_unlock(); @@ -3199,6 +3302,7 @@ static bool intel_pstate_hwp_is_enabled(void) static int __init intel_pstate_init(void) { + static struct cpudata **_all_cpu_data; const struct x86_cpu_id *id; int rc; @@ -3224,7 +3328,7 @@ static int __init intel_pstate_init(void) * deal with it. 
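The hwp_active and all_cpu_data accesses in these hunks switch to READ_ONCE()/WRITE_ONCE() because notify_hwp_interrupt() runs in interrupt context with no lock protecting either variable. This is the standard annotation pair for a flag published once at init and read concurrently; in isolation:

    static int ready;               /* written at init, read from IRQ context */

    void init_path(void)
    {
            /* ... set everything up, then publish the flag: */
            WRITE_ONCE(ready, 1);   /* prevents store tearing/fusing */
    }

    void irq_path(void)
    {
            if (!READ_ONCE(ready))  /* pairs with the writer; a plain read races */
                    return;
            /* ... safe to touch the published state ... */
    }
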
*/ if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) { - hwp_active++; + WRITE_ONCE(hwp_active, 1); hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; intel_cpufreq.attr = hwp_cpufreq_attrs; @@ -3275,10 +3379,12 @@ hwp_cpu_matched: pr_info("Intel P-state driver initializing\n"); - all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus())); - if (!all_cpu_data) + _all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus())); + if (!_all_cpu_data) return -ENOMEM; + WRITE_ONCE(all_cpu_data, _all_cpu_data); + intel_pstate_request_control_from_smm(); intel_pstate_sysfs_expose_params(); diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 0cf18dd46b92..8ddbd0c5ce37 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -109,7 +109,7 @@ static unsigned int mtk_cpufreq_hw_fast_switch(struct cpufreq_policy *policy, struct mtk_cpufreq_data *data = policy->driver_data; unsigned int index; - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, false); writel_relaxed(index, data->reg_bases[REG_FREQ_PERF_STATE]); diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 5a2cf5f91ccb..fddbd1ea1635 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -934,7 +934,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work) policy = cpufreq_cpu_get(cpu); if (!policy) continue; - index = cpufreq_table_find_index_c(policy, policy->cur); + index = cpufreq_table_find_index_c(policy, policy->cur, false); powernv_cpufreq_target_index(policy, index); cpumask_andnot(&mask, &mask, policy->cpus); cpufreq_cpu_put(policy); @@ -1022,7 +1022,7 @@ static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, int index; struct powernv_smp_call_data freq_data; - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, false); freq_data.pstate_id = powernv_freqs[index].driver_data; freq_data.gpstate_id = powernv_freqs[index].driver_data; set_pstate(&freq_data); diff --git a/drivers/cpufreq/s3c2440-cpufreq.c b/drivers/cpufreq/s3c2440-cpufreq.c index 148e8aedefa9..2011fb9c03a4 100644 --- a/drivers/cpufreq/s3c2440-cpufreq.c +++ b/drivers/cpufreq/s3c2440-cpufreq.c @@ -173,12 +173,14 @@ static void s3c2440_cpufreq_setdivs(struct s3c_cpufreq_config *cfg) case 6: camdiv |= S3C2440_CAMDIVN_HCLK3_HALF; + fallthrough; case 3: clkdiv |= S3C2440_CLKDIVN_HDIVN_3_6; break; case 8: camdiv |= S3C2440_CAMDIVN_HCLK4_HALF; + fallthrough; case 4: clkdiv |= S3C2440_CLKDIVN_HDIVN_4_8; break; diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index ad7d4f272ddc..76c888ed8d16 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -243,7 +243,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index) new_freq = s5pv210_freq_table[index].frequency; /* Finding current running level index */ - priv_index = cpufreq_table_find_index_h(policy, old_freq); + priv_index = cpufreq_table_find_index_h(policy, old_freq, false); arm_volt = dvs_conf[index].arm_volt; int_volt = dvs_conf[index].int_volt; diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 5d1943e787b0..6c88827f4e62 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -159,6 +159,10 @@ 
static struct cpufreq_frequency_table *init_vhint_table( table = ERR_PTR(err); goto free; } + if (msg.rx.ret) { + table = ERR_PTR(-EINVAL); + goto free; + } for (i = data->vfloor; i <= data->vceil; i++) { u16 ndiv = data->ndiv[i]; diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index a9620e4489ae..ac381db25dbe 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -242,7 +242,7 @@ static int tegra194_cpufreq_init(struct cpufreq_policy *policy) smp_call_function_single(policy->cpu, get_cpu_cluster, &cl, true); - if (cl >= data->num_clusters) + if (cl >= data->num_clusters || !data->tables[cl]) return -EINVAL; /* set same policy for all cpus in a cluster */ @@ -310,6 +310,12 @@ init_freq_table(struct platform_device *pdev, struct tegra_bpmp *bpmp, err = tegra_bpmp_transfer(bpmp, &msg); if (err) return ERR_PTR(err); + if (msg.rx.ret == -BPMP_EINVAL) { + /* Cluster not available */ + return NULL; + } + if (msg.rx.ret) + return ERR_PTR(-EINVAL); /* * Make sure frequency table step is a multiple of mdiv to match diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 53ec9585ccd4..469e18547d06 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -488,6 +488,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) &kdev->kobj, "state%d", i); if (ret) { kobject_put(&kobj->kobj); + kfree(kobj); goto error_state; } cpuidle_add_s2idle_attr_group(kobj); @@ -619,6 +620,7 @@ static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) &kdev->kobj, "driver"); if (ret) { kobject_put(&kdrv->kobj); + kfree(kdrv); return ret; } @@ -705,7 +707,6 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev) if (!kdev) return -ENOMEM; kdev->dev = dev; - dev->kobj_dev = kdev; init_completion(&kdev->kobj_unregister); @@ -713,9 +714,11 @@ int cpuidle_add_sysfs(struct cpuidle_device *dev) "cpuidle"); if (error) { kobject_put(&kdev->kobj); + kfree(kdev); return error; } + dev->kobj_dev = kdev; kobject_uevent(&kdev->kobj, KOBJ_ADD); return 0; diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 85faa7a5c7d1..06333d430382 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -827,7 +827,7 @@ struct devfreq *devfreq_add_device(struct device *dev, goto err_dev; } - if (!devfreq->profile->max_state && !devfreq->profile->freq_table) { + if (!devfreq->profile->max_state || !devfreq->profile->freq_table) { mutex_unlock(&devfreq->lock); err = set_freq_table(devfreq); if (err < 0) diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c index 17ed980d9099..9b849d781116 100644 --- a/drivers/devfreq/event/exynos-ppmu.c +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -94,11 +94,16 @@ static struct __exynos_ppmu_events { PPMU_EVENT(d1-general), PPMU_EVENT(d1-rt), - /* For Exynos5422 SoC */ + /* For Exynos5422 SoC, deprecated (backwards compatible) */ PPMU_EVENT(dmc0_0), PPMU_EVENT(dmc0_1), PPMU_EVENT(dmc1_0), PPMU_EVENT(dmc1_1), + /* For Exynos5422 SoC */ + PPMU_EVENT(dmc0-0), + PPMU_EVENT(dmc0-1), + PPMU_EVENT(dmc1-0), + PPMU_EVENT(dmc1-1), }; static int __exynos_ppmu_find_ppmu_id(const char *edev_name) @@ -561,13 +566,10 @@ static int of_get_devfreq_events(struct device_node *np, * use default if not. */ if (info->ppmu_type == EXYNOS_TYPE_PPMU_V2) { - int id; /* Not all registers take the same value for * read+write data count. 
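Both Tegra hunks add the same missing check, and it is a general tegra_bpmp_transfer() pattern: a zero return only means the IPC round trip completed, while the firmware's own verdict arrives in msg.rx.ret and must be inspected separately. Condensed:

    err = tegra_bpmp_transfer(bpmp, &msg);
    if (err)                        /* transport/IPC failure */
            return ERR_PTR(err);

    if (msg.rx.ret)                 /* firmware processed but rejected it */
            return ERR_PTR(-EINVAL);

tegra194 additionally treats -BPMP_EINVAL as "cluster not present" and returns NULL so such clusters are skipped rather than failing the probe.
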
*/ - id = __exynos_ppmu_find_ppmu_id(desc[j].name); - - switch (id) { + switch (ppmu_events[i].id) { case PPMU_PMNCNT0: case PPMU_PMNCNT1: case PPMU_PMNCNT2: diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index e6c543b5ee1d..0b66e25c0e2d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -89,6 +89,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata; static unsigned int mwait_substates __initdata; /* + * Enable interrupts before entering the C-state. On some platforms and for + * some C-states, this may measurably decrease interrupt latency. + */ +#define CPUIDLE_FLAG_IRQ_ENABLE BIT(14) + +/* * Enable this state by default even if the ACPI _CST does not list it. */ #define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15) @@ -127,6 +133,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ + if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) + local_irq_enable(); + mwait_idle_with_hints(eax, ecx); return index; @@ -698,7 +707,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, @@ -727,7 +736,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index b00d80370379..a42dbf448860 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -906,7 +906,7 @@ acpi_status pci_acpi_add_pm_notifier(struct acpi_device *dev, * choose highest power _SxD or any lower power */ -static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) +pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) { int acpi_state, d_max; @@ -965,22 +965,20 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) return 0; } -static bool acpi_pci_power_manageable(struct pci_dev *dev) +bool acpi_pci_power_manageable(struct pci_dev *dev) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); - if (!adev) - return false; - return acpi_device_power_manageable(adev); + return adev && acpi_device_power_manageable(adev); } -static bool acpi_pci_bridge_d3(struct pci_dev *dev) +bool acpi_pci_bridge_d3(struct pci_dev *dev) { const union acpi_object *obj; struct acpi_device *adev; struct pci_dev *rpdev; - if (!dev->is_hotplug_bridge) + if (acpi_pci_disabled || !dev->is_hotplug_bridge) return false; /* Assume D3 support if the bridge is power-manageable by ACPI. 
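The cpuidle sysfs kfree() additions a little further back deserve a note: the cpuidle kobj_types' ->release() callbacks only complete() the kobj_unregister completion so that the normal removal path can wait and then free; they never kfree() the object themselves. On the kobject_init_and_add() error path there is no removal path to do that, hence the explicit free after the put:

    ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle,
                               &kdev->kobj, "state%d", i);
    if (ret) {
            kobject_put(&kobj->kobj);  /* drops the ref, runs ->release() */
            kfree(kobj);               /* ->release() only signals, never frees */
            goto error_state;
    }

The same reasoning covers the driver and device kobjects, and moving the dev->kobj_dev assignment after the error check ensures a failed add never leaves a dangling pointer behind.
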
*/ @@ -1008,7 +1006,7 @@ static bool acpi_pci_bridge_d3(struct pci_dev *dev) return obj->integer.value == 1; } -static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) +int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); static const u8 state_conv[] = { @@ -1046,7 +1044,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) return error; } -static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) +pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); static const pci_power_t state_conv[] = { @@ -1068,7 +1066,7 @@ static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) return state_conv[state]; } -static void acpi_pci_refresh_power_state(struct pci_dev *dev) +void acpi_pci_refresh_power_state(struct pci_dev *dev) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); @@ -1093,17 +1091,23 @@ static int acpi_pci_propagate_wakeup(struct pci_bus *bus, bool enable) return 0; } -static int acpi_pci_wakeup(struct pci_dev *dev, bool enable) +int acpi_pci_wakeup(struct pci_dev *dev, bool enable) { + if (acpi_pci_disabled) + return 0; + if (acpi_pm_device_can_wakeup(&dev->dev)) return acpi_pm_set_device_wakeup(&dev->dev, enable); return acpi_pci_propagate_wakeup(dev->bus, enable); } -static bool acpi_pci_need_resume(struct pci_dev *dev) +bool acpi_pci_need_resume(struct pci_dev *dev) { - struct acpi_device *adev = ACPI_COMPANION(&dev->dev); + struct acpi_device *adev; + + if (acpi_pci_disabled) + return false; /* * In some cases (eg. Samsung 305V4A) leaving a bridge in suspend over @@ -1115,6 +1119,7 @@ static bool acpi_pci_need_resume(struct pci_dev *dev) if (pci_is_bridge(dev) && acpi_target_system_state() != ACPI_STATE_S0) return true; + adev = ACPI_COMPANION(&dev->dev); if (!adev || !acpi_device_power_manageable(adev)) return false; @@ -1128,17 +1133,6 @@ static bool acpi_pci_need_resume(struct pci_dev *dev) return !!adev->power.flags.dsw_present; } -static const struct pci_platform_pm_ops acpi_pci_platform_pm = { - .bridge_d3 = acpi_pci_bridge_d3, - .is_manageable = acpi_pci_power_manageable, - .set_state = acpi_pci_set_power_state, - .get_state = acpi_pci_get_power_state, - .refresh_state = acpi_pci_refresh_power_state, - .choose_state = acpi_pci_choose_state, - .set_wakeup = acpi_pci_wakeup, - .need_resume = acpi_pci_need_resume, -}; - void acpi_pci_add_bus(struct pci_bus *bus) { union acpi_object *obj; @@ -1451,7 +1445,6 @@ static int __init acpi_pci_init(void) if (acpi_pci_disabled) return 0; - pci_set_platform_pm(&acpi_pci_platform_pm); acpi_pci_slot_init(); acpiphp_init(); diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c index aafd58da3a89..fbfd78127123 100644 --- a/drivers/pci/pci-mid.c +++ b/drivers/pci/pci-mid.c @@ -16,45 +16,23 @@ #include "pci.h" -static bool mid_pci_power_manageable(struct pci_dev *dev) +static bool pci_mid_pm_enabled __read_mostly; + +bool pci_use_mid_pm(void) { - return true; + return pci_mid_pm_enabled; } -static int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) +int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) { return intel_mid_pci_set_power_state(pdev, state); } -static pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) +pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) { return intel_mid_pci_get_power_state(pdev); } -static pci_power_t mid_pci_choose_state(struct pci_dev *pdev) -{ - return PCI_D3hot; -} 
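What replaces the deleted mid_pci_* stubs is compile-time dispatch: pci.c now asks pci_use_mid_pm() and falls through to the ACPI helpers otherwise, and the !CONFIG_X86_INTEL_MID stub added to pci.h later in this diff lets the compiler discard the MID branch entirely:

    static inline bool pci_use_mid_pm(void)
    {
            return false;   /* MID branch folds away when the option is off */
    }

The trivial callbacks removed above (choose_state, wakeup, need_resume) become constants at the pci.c call sites, so no behavior is lost by dropping them.
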
- -static int mid_pci_wakeup(struct pci_dev *dev, bool enable) -{ - return 0; -} - -static bool mid_pci_need_resume(struct pci_dev *dev) -{ - return false; -} - -static const struct pci_platform_pm_ops mid_pci_platform_pm = { - .is_manageable = mid_pci_power_manageable, - .set_state = mid_pci_set_power_state, - .get_state = mid_pci_get_power_state, - .choose_state = mid_pci_choose_state, - .set_wakeup = mid_pci_wakeup, - .need_resume = mid_pci_need_resume, -}; - /* * This table should be in sync with the one in * arch/x86/platform/intel-mid/pwr.c. @@ -71,7 +49,8 @@ static int __init mid_pci_init(void) id = x86_match_cpu(lpss_cpu_ids); if (id) - pci_set_platform_pm(&mid_pci_platform_pm); + pci_mid_pm_enabled = true; + return 0; } arch_initcall(mid_pci_init); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..11c88ffb51d9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -972,61 +972,67 @@ static void pci_restore_bars(struct pci_dev *dev) pci_update_resource(dev, i); } -static const struct pci_platform_pm_ops *pci_platform_pm; - -int pci_set_platform_pm(const struct pci_platform_pm_ops *ops) -{ - if (!ops->is_manageable || !ops->set_state || !ops->get_state || - !ops->choose_state || !ops->set_wakeup || !ops->need_resume) - return -EINVAL; - pci_platform_pm = ops; - return 0; -} - static inline bool platform_pci_power_manageable(struct pci_dev *dev) { - return pci_platform_pm ? pci_platform_pm->is_manageable(dev) : false; + if (pci_use_mid_pm()) + return true; + + return acpi_pci_power_manageable(dev); } static inline int platform_pci_set_power_state(struct pci_dev *dev, pci_power_t t) { - return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS; + if (pci_use_mid_pm()) + return mid_pci_set_power_state(dev, t); + + return acpi_pci_set_power_state(dev, t); } static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev) { - return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN; + if (pci_use_mid_pm()) + return mid_pci_get_power_state(dev); + + return acpi_pci_get_power_state(dev); } static inline void platform_pci_refresh_power_state(struct pci_dev *dev) { - if (pci_platform_pm && pci_platform_pm->refresh_state) - pci_platform_pm->refresh_state(dev); + if (!pci_use_mid_pm()) + acpi_pci_refresh_power_state(dev); } static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) { - return pci_platform_pm ? - pci_platform_pm->choose_state(dev) : PCI_POWER_ERROR; + if (pci_use_mid_pm()) + return PCI_POWER_ERROR; + + return acpi_pci_choose_state(dev); } static inline int platform_pci_set_wakeup(struct pci_dev *dev, bool enable) { - return pci_platform_pm ? - pci_platform_pm->set_wakeup(dev, enable) : -ENODEV; + if (pci_use_mid_pm()) + return PCI_POWER_ERROR; + + return acpi_pci_wakeup(dev, enable); } static inline bool platform_pci_need_resume(struct pci_dev *dev) { - return pci_platform_pm ? 
pci_platform_pm->need_resume(dev) : false; + if (pci_use_mid_pm()) + return false; + + return acpi_pci_need_resume(dev); } static inline bool platform_pci_bridge_d3(struct pci_dev *dev) { - if (pci_platform_pm && pci_platform_pm->bridge_d3) - return pci_platform_pm->bridge_d3(dev); - return false; + if (pci_use_mid_pm()) + return false; + + return acpi_pci_bridge_d3(dev); } /** @@ -1185,9 +1191,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) */ void pci_refresh_power_state(struct pci_dev *dev) { - if (platform_pci_power_manageable(dev)) - platform_pci_refresh_power_state(dev); - + platform_pci_refresh_power_state(dev); pci_update_current_state(dev, dev->current_state); } @@ -1200,14 +1204,10 @@ int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state) { int error; - if (platform_pci_power_manageable(dev)) { - error = platform_pci_set_power_state(dev, state); - if (!error) - pci_update_current_state(dev, state); - } else - error = -ENODEV; - - if (error && !dev->pm_cap) /* Fall back to PCI_D0 */ + error = platform_pci_set_power_state(dev, state); + if (!error) + pci_update_current_state(dev, state); + else if (!dev->pm_cap) /* Fall back to PCI_D0 */ dev->current_state = PCI_D0; return error; @@ -1388,44 +1388,6 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) } EXPORT_SYMBOL(pci_set_power_state); -/** - * pci_choose_state - Choose the power state of a PCI device - * @dev: PCI device to be suspended - * @state: target sleep state for the whole system. This is the value - * that is passed to suspend() function. - * - * Returns PCI power state suitable for given device and given system - * message. - */ -pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) -{ - pci_power_t ret; - - if (!dev->pm_cap) - return PCI_D0; - - ret = platform_pci_choose_state(dev); - if (ret != PCI_POWER_ERROR) - return ret; - - switch (state.event) { - case PM_EVENT_ON: - return PCI_D0; - case PM_EVENT_FREEZE: - case PM_EVENT_PRETHAW: - /* REVISIT both freeze and pre-thaw "should" use D0 */ - case PM_EVENT_SUSPEND: - case PM_EVENT_HIBERNATE: - return PCI_D3hot; - default: - pci_info(dev, "unrecognized suspend event %d\n", - state.event); - BUG(); - } - return PCI_D0; -} -EXPORT_SYMBOL(pci_choose_state); - #define PCI_EXP_SAVE_REGS 7 static struct pci_cap_saved_state *_pci_find_saved_cap(struct pci_dev *pci_dev, @@ -2577,8 +2539,6 @@ EXPORT_SYMBOL(pci_wake_from_d3); */ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup) { - pci_power_t target_state = PCI_D3hot; - if (platform_pci_power_manageable(dev)) { /* * Call the platform to find the target state for the device. @@ -2588,32 +2548,29 @@ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup) switch (state) { case PCI_POWER_ERROR: case PCI_UNKNOWN: - break; + return PCI_D3hot; + case PCI_D1: case PCI_D2: if (pci_no_d1d2(dev)) - break; - fallthrough; - default: - target_state = state; + return PCI_D3hot; } - return target_state; + return state; } - if (!dev->pm_cap) - target_state = PCI_D0; - /* * If the device is in D3cold even though it's not power-manageable by * the platform, it may have been powered down by non-standard means. * Best to let it slumber. 
*/ if (dev->current_state == PCI_D3cold) - target_state = PCI_D3cold; + return PCI_D3cold; + else if (!dev->pm_cap) + return PCI_D0; if (wakeup && dev->pme_support) { - pci_power_t state = target_state; + pci_power_t state = PCI_D3hot; /* * Find the deepest state from which the device can generate @@ -2628,7 +2585,7 @@ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup) return PCI_D0; } - return target_state; + return PCI_D3hot; } /** @@ -2681,8 +2638,13 @@ EXPORT_SYMBOL(pci_prepare_to_sleep); */ int pci_back_from_sleep(struct pci_dev *dev) { + int ret = pci_set_power_state(dev, PCI_D0); + + if (ret) + return ret; + pci_enable_wake(dev, PCI_D0, false); - return pci_set_power_state(dev, PCI_D0); + return 0; } EXPORT_SYMBOL(pci_back_from_sleep); @@ -2842,6 +2804,22 @@ void pci_dev_complete_resume(struct pci_dev *pci_dev) spin_unlock_irq(&dev->power.lock); } +/** + * pci_choose_state - Choose the power state of a PCI device. + * @dev: Target PCI device. + * @state: Target state for the whole system. + * + * Returns PCI power state suitable for @dev and @state. + */ +pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) +{ + if (state.event == PM_EVENT_ON) + return PCI_D0; + + return pci_target_state(dev, false); +} +EXPORT_SYMBOL(pci_choose_state); + void pci_config_pm_runtime_get(struct pci_dev *pdev) { struct device *dev = &pdev->dev; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1cce56c2aea0..bd39098c57da 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -63,45 +63,6 @@ struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ -/** - * struct pci_platform_pm_ops - Firmware PM callbacks - * - * @bridge_d3: Does the bridge allow entering into D3 - * - * @is_manageable: returns 'true' if given device is power manageable by the - * platform firmware - * - * @set_state: invokes the platform firmware to set the device's power state - * - * @get_state: queries the platform firmware for a device's current power state - * - * @refresh_state: asks the platform to refresh the device's power state data - * - * @choose_state: returns PCI power state of given device preferred by the - * platform; to be used during system-wide transitions from a - * sleeping state to the working state and vice versa - * - * @set_wakeup: enables/disables wakeup capability for the device - * - * @need_resume: returns 'true' if the given device (which is currently - * suspended) needs to be resumed to be configured for system - * wakeup. - * - * If given platform is generally capable of power managing PCI devices, all of - * these callbacks are mandatory. 
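A few hunks up, pci_choose_state() survives as an exported wrapper, now built on pci_target_state() instead of carrying its own switch over PM events. Its traditional call site is the legacy .suspend() hook of a PCI driver, which is unchanged by this rework (sketch; foo_suspend is hypothetical):

    static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
    {
            pci_save_state(pdev);
            pci_set_power_state(pdev, pci_choose_state(pdev, state));
            return 0;
    }
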
- */ -struct pci_platform_pm_ops { - bool (*bridge_d3)(struct pci_dev *dev); - bool (*is_manageable)(struct pci_dev *dev); - int (*set_state)(struct pci_dev *dev, pci_power_t state); - pci_power_t (*get_state)(struct pci_dev *dev); - void (*refresh_state)(struct pci_dev *dev); - pci_power_t (*choose_state)(struct pci_dev *dev); - int (*set_wakeup)(struct pci_dev *dev, bool enable); - bool (*need_resume)(struct pci_dev *dev); -}; - -int pci_set_platform_pm(const struct pci_platform_pm_ops *ops); void pci_update_current_state(struct pci_dev *dev, pci_power_t state); void pci_refresh_power_state(struct pci_dev *dev); int pci_power_up(struct pci_dev *dev); @@ -725,17 +686,53 @@ int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); +bool acpi_pci_power_manageable(struct pci_dev *dev); +bool acpi_pci_bridge_d3(struct pci_dev *dev); +int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state); +pci_power_t acpi_pci_get_power_state(struct pci_dev *dev); +void acpi_pci_refresh_power_state(struct pci_dev *dev); +int acpi_pci_wakeup(struct pci_dev *dev, bool enable); +bool acpi_pci_need_resume(struct pci_dev *dev); +pci_power_t acpi_pci_choose_state(struct pci_dev *pdev); #else static inline int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } - static inline void pci_set_acpi_fwnode(struct pci_dev *dev) {} static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } +static inline bool acpi_pci_power_manageable(struct pci_dev *dev) +{ + return false; +} +static inline bool acpi_pci_bridge_d3(struct pci_dev *dev) +{ + return false; +} +static inline int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) +{ + return -ENODEV; +} +static inline pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) +{ + return PCI_UNKNOWN; +} +static inline void acpi_pci_refresh_power_state(struct pci_dev *dev) {} +static inline int acpi_pci_wakeup(struct pci_dev *dev, bool enable) +{ + return -ENODEV; +} +static inline bool acpi_pci_need_resume(struct pci_dev *dev) +{ + return false; +} +static inline pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) +{ + return PCI_POWER_ERROR; +} #endif #ifdef CONFIG_PCIEASPM @@ -744,4 +741,23 @@ extern const struct attribute_group aspm_ctrl_attr_group; extern const struct attribute_group pci_dev_reset_method_attr_group; +#ifdef CONFIG_X86_INTEL_MID +bool pci_use_mid_pm(void); +int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); +pci_power_t mid_pci_get_power_state(struct pci_dev *pdev); +#else +static inline bool pci_use_mid_pm(void) +{ + return false; +} +static inline int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) +{ + return -ENODEV; +} +static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) +{ + return PCI_UNKNOWN; +} +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index c2185ec5f887..b9fac786246a 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -116,8 +116,6 @@ static void __dtpm_sub_power(struct dtpm *dtpm) parent->power_limit -= dtpm->power_limit; parent = parent->parent; } - - __dtpm_rebalance_weight(root); } static void __dtpm_add_power(struct dtpm *dtpm) @@ -130,45 +128,45 @@ static void __dtpm_add_power(struct dtpm *dtpm) parent->power_limit += dtpm->power_limit; parent = parent->parent; } +} + +static 
int __dtpm_update_power(struct dtpm *dtpm) +{ + int ret; + + __dtpm_sub_power(dtpm); + + ret = dtpm->ops->update_power_uw(dtpm); + if (ret) + pr_err("Failed to update power for '%s': %d\n", + dtpm->zone.name, ret); - __dtpm_rebalance_weight(root); + if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags)) + dtpm->power_limit = dtpm->power_max; + + __dtpm_add_power(dtpm); + + if (root) + __dtpm_rebalance_weight(root); + + return ret; } /** * dtpm_update_power - Update the power on the dtpm * @dtpm: a pointer to a dtpm structure to update - * @power_min: a u64 representing the new power_min value - * @power_max: a u64 representing the new power_max value * * Function to update the power values of the dtpm node specified in * parameter. These new values will be propagated to the tree. * * Return: zero on success, -EINVAL if the values are inconsistent */ -int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max) +int dtpm_update_power(struct dtpm *dtpm) { - int ret = 0; + int ret; mutex_lock(&dtpm_lock); - - if (power_min == dtpm->power_min && power_max == dtpm->power_max) - goto unlock; - - if (power_max < power_min) { - ret = -EINVAL; - goto unlock; - } - - __dtpm_sub_power(dtpm); - - dtpm->power_min = power_min; - dtpm->power_max = power_max; - if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags)) - dtpm->power_limit = power_max; - - __dtpm_add_power(dtpm); - -unlock: + ret = __dtpm_update_power(dtpm); mutex_unlock(&dtpm_lock); return ret; @@ -359,24 +357,18 @@ static struct powercap_zone_ops zone_ops = { }; /** - * dtpm_alloc - Allocate and initialize a dtpm struct - * @name: a string specifying the name of the node - * - * Return: a struct dtpm pointer, NULL in case of error + * dtpm_init - Allocate and initialize a dtpm struct + * @dtpm: The dtpm struct pointer to be initialized + * @ops: The dtpm device specific ops, NULL for a virtual node */ -struct dtpm *dtpm_alloc(struct dtpm_ops *ops) +void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops) { - struct dtpm *dtpm; - - dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL); if (dtpm) { INIT_LIST_HEAD(&dtpm->children); INIT_LIST_HEAD(&dtpm->sibling); dtpm->weight = 1024; dtpm->ops = ops; } - - return dtpm; } /** @@ -436,6 +428,7 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) if (dtpm->ops && !(dtpm->ops->set_power_uw && dtpm->ops->get_power_uw && + dtpm->ops->update_power_uw && dtpm->ops->release)) return -EINVAL; @@ -455,7 +448,10 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) root = dtpm; } - __dtpm_add_power(dtpm); + if (dtpm->ops && !dtpm->ops->update_power_uw(dtpm)) { + __dtpm_add_power(dtpm); + dtpm->power_limit = dtpm->power_max; + } pr_info("Registered dtpm node '%s' / %llu-%llu uW, \n", dtpm->zone.name, dtpm->power_min, dtpm->power_max); @@ -465,9 +461,9 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) return 0; } -static int __init dtpm_init(void) +static int __init init_dtpm(void) { - struct dtpm_descr **dtpm_descr; + struct dtpm_descr *dtpm_descr; pct = powercap_register_control_type(NULL, "dtpm", NULL); if (IS_ERR(pct)) { @@ -476,8 +472,8 @@ static int __init dtpm_init(void) } for_each_dtpm_table(dtpm_descr) - (*dtpm_descr)->init(*dtpm_descr); + dtpm_descr->init(); return 0; } -late_initcall(dtpm_init); +late_initcall(init_dtpm); diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 51c366938acd..44faa3a74db6 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -14,6 +14,8 @@ * The CPU 
hotplug is supported and the power numbers will be updated * if a CPU is hot plugged / unplugged. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cpumask.h> #include <linux/cpufreq.h> #include <linux/cpuhotplug.h> @@ -23,66 +25,29 @@ #include <linux/slab.h> #include <linux/units.h> -static struct dtpm *__parent; - -static DEFINE_PER_CPU(struct dtpm *, dtpm_per_cpu); - struct dtpm_cpu { + struct dtpm dtpm; struct freq_qos_request qos_req; int cpu; }; -/* - * When a new CPU is inserted at hotplug or boot time, add the power - * contribution and update the dtpm tree. - */ -static int power_add(struct dtpm *dtpm, struct em_perf_domain *em) -{ - u64 power_min, power_max; - - power_min = em->table[0].power; - power_min *= MICROWATT_PER_MILLIWATT; - power_min += dtpm->power_min; +static DEFINE_PER_CPU(struct dtpm_cpu *, dtpm_per_cpu); - power_max = em->table[em->nr_perf_states - 1].power; - power_max *= MICROWATT_PER_MILLIWATT; - power_max += dtpm->power_max; - - return dtpm_update_power(dtpm, power_min, power_max); -} - -/* - * When a CPU is unplugged, remove its power contribution from the - * dtpm tree. - */ -static int power_sub(struct dtpm *dtpm, struct em_perf_domain *em) +static struct dtpm_cpu *to_dtpm_cpu(struct dtpm *dtpm) { - u64 power_min, power_max; - - power_min = em->table[0].power; - power_min *= MICROWATT_PER_MILLIWATT; - power_min = dtpm->power_min - power_min; - - power_max = em->table[em->nr_perf_states - 1].power; - power_max *= MICROWATT_PER_MILLIWATT; - power_max = dtpm->power_max - power_max; - - return dtpm_update_power(dtpm, power_min, power_max); + return container_of(dtpm, struct dtpm_cpu, dtpm); } static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; - struct em_perf_domain *pd; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); + struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu); struct cpumask cpus; unsigned long freq; u64 power; int i, nr_cpus; - pd = em_cpu_get(dtpm_cpu->cpu); - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); for (i = 0; i < pd->nr_perf_states; i++) { @@ -103,34 +68,88 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) return power_limit; } +static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power) +{ + unsigned long max = 0, sum_util = 0; + int cpu; + + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + + /* + * The capacity is the same for all CPUs belonging to + * the same perf domain, so a single call to + * arch_scale_cpu_capacity() is enough. However, we + * need the CPU parameter to be initialized by the + * loop, so the call ends up in this block. + * + * We can initialize 'max' with a cpumask_first() call + * before the loop but the bits computation is not + * worth given the arch_scale_cpu_capacity() just + * returns a value where the resulting assembly code + * will be optimized by the compiler. + */ + max = arch_scale_cpu_capacity(cpu); + sum_util += sched_cpu_util(cpu, max); + } + + /* + * In the improbable case where all the CPUs of the perf + * domain are offline, 'max' will be zero and will lead to an + * illegal operation with a zero division. + */ + return max ? 
(power * ((sum_util << 10) / max)) >> 10 : 0; +} + static u64 get_pd_power_uw(struct dtpm *dtpm) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd; - struct cpumask cpus; + struct cpumask *pd_mask; unsigned long freq; - int i, nr_cpus; + int i; pd = em_cpu_get(dtpm_cpu->cpu); + + pd_mask = em_span_cpus(pd); + freq = cpufreq_quick_get(dtpm_cpu->cpu); - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); for (i = 0; i < pd->nr_perf_states; i++) { if (pd->table[i].frequency < freq) continue; - return pd->table[i].power * - MICROWATT_PER_MILLIWATT * nr_cpus; + return scale_pd_power_uw(pd_mask, pd->table[i].power * + MICROWATT_PER_MILLIWATT); } return 0; } +static int update_pd_power_uw(struct dtpm *dtpm) +{ + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); + struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu); + struct cpumask cpus; + int nr_cpus; + + cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus)); + nr_cpus = cpumask_weight(&cpus); + + dtpm->power_min = em->table[0].power; + dtpm->power_min *= MICROWATT_PER_MILLIWATT; + dtpm->power_min *= nr_cpus; + + dtpm->power_max = em->table[em->nr_perf_states - 1].power; + dtpm->power_max *= MICROWATT_PER_MILLIWATT; + dtpm->power_max *= nr_cpus; + + return 0; +} + static void pd_release(struct dtpm *dtpm) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); if (freq_qos_request_active(&dtpm_cpu->qos_req)) freq_qos_remove_request(&dtpm_cpu->qos_req); @@ -139,44 +158,28 @@ static void pd_release(struct dtpm *dtpm) } static struct dtpm_ops dtpm_ops = { - .set_power_uw = set_pd_power_limit, - .get_power_uw = get_pd_power_uw, - .release = pd_release, + .set_power_uw = set_pd_power_limit, + .get_power_uw = get_pd_power_uw, + .update_power_uw = update_pd_power_uw, + .release = pd_release, }; static int cpuhp_dtpm_cpu_offline(unsigned int cpu) { - struct cpufreq_policy *policy; struct em_perf_domain *pd; - struct dtpm *dtpm; - - policy = cpufreq_cpu_get(cpu); - - if (!policy) - return 0; + struct dtpm_cpu *dtpm_cpu; pd = em_cpu_get(cpu); if (!pd) return -EINVAL; - dtpm = per_cpu(dtpm_per_cpu, cpu); + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); - power_sub(dtpm, pd); - - if (cpumask_weight(policy->cpus) != 1) - return 0; - - for_each_cpu(cpu, policy->related_cpus) - per_cpu(dtpm_per_cpu, cpu) = NULL; - - dtpm_unregister(dtpm); - - return 0; + return dtpm_update_power(&dtpm_cpu->dtpm); } static int cpuhp_dtpm_cpu_online(unsigned int cpu) { - struct dtpm *dtpm; struct dtpm_cpu *dtpm_cpu; struct cpufreq_policy *policy; struct em_perf_domain *pd; @@ -184,7 +187,6 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) int ret = -ENOMEM; policy = cpufreq_cpu_get(cpu); - if (!policy) return 0; @@ -192,66 +194,82 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) if (!pd) return -EINVAL; - dtpm = per_cpu(dtpm_per_cpu, cpu); - if (dtpm) - return power_add(dtpm, pd); - - dtpm = dtpm_alloc(&dtpm_ops); - if (!dtpm) - return -EINVAL; + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); + if (dtpm_cpu) + return dtpm_update_power(&dtpm_cpu->dtpm); dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); if (!dtpm_cpu) - goto out_kfree_dtpm; + return -ENOMEM; - dtpm->private = dtpm_cpu; + dtpm_init(&dtpm_cpu->dtpm, &dtpm_ops); dtpm_cpu->cpu = cpu; for_each_cpu(cpu, policy->related_cpus) - per_cpu(dtpm_per_cpu, cpu) = dtpm; + per_cpu(dtpm_per_cpu, cpu) = dtpm_cpu; - sprintf(name, "cpu%d", dtpm_cpu->cpu); + snprintf(name, sizeof(name), 
"cpu%d-cpufreq", dtpm_cpu->cpu); - ret = dtpm_register(name, dtpm, __parent); + ret = dtpm_register(name, &dtpm_cpu->dtpm, NULL); if (ret) goto out_kfree_dtpm_cpu; - ret = power_add(dtpm, pd); - if (ret) - goto out_dtpm_unregister; - ret = freq_qos_add_request(&policy->constraints, &dtpm_cpu->qos_req, FREQ_QOS_MAX, pd->table[pd->nr_perf_states - 1].frequency); if (ret) - goto out_power_sub; + goto out_dtpm_unregister; return 0; -out_power_sub: - power_sub(dtpm, pd); - out_dtpm_unregister: - dtpm_unregister(dtpm); + dtpm_unregister(&dtpm_cpu->dtpm); dtpm_cpu = NULL; - dtpm = NULL; out_kfree_dtpm_cpu: for_each_cpu(cpu, policy->related_cpus) per_cpu(dtpm_per_cpu, cpu) = NULL; kfree(dtpm_cpu); -out_kfree_dtpm: - kfree(dtpm); return ret; } -int dtpm_register_cpu(struct dtpm *parent) +static int __init dtpm_cpu_init(void) { - __parent = parent; + int ret; + + /* + * The callbacks at CPU hotplug time are calling + * dtpm_update_power() which in turns calls update_pd_power(). + * + * The function update_pd_power() uses the online mask to + * figure out the power consumption limits. + * + * At CPUHP_AP_ONLINE_DYN, the CPU is present in the CPU + * online mask when the cpuhp_dtpm_cpu_online function is + * called, but the CPU is still in the online mask for the + * tear down callback. So the power can not be updated when + * the CPU is unplugged. + * + * At CPUHP_AP_DTPM_CPU_DEAD, the situation is the opposite as + * above. The CPU online mask is not up to date when the CPU + * is plugged in. + * + * For this reason, we need to call the online and offline + * callbacks at different moments when the CPU online mask is + * consistent with the power numbers we want to update. + */ + ret = cpuhp_setup_state(CPUHP_AP_DTPM_CPU_DEAD, "dtpm_cpu:offline", + NULL, cpuhp_dtpm_cpu_offline); + if (ret < 0) + return ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "dtpm_cpu:online", + cpuhp_dtpm_cpu_online, NULL); + if (ret < 0) + return ret; - return cpuhp_setup_state(CPUHP_AP_DTPM_CPU_ONLINE, - "dtpm_cpu:online", - cpuhp_dtpm_cpu_online, - cpuhp_dtpm_cpu_offline); + return 0; } + +DTPM_DECLARE(dtpm_cpu, dtpm_cpu_init); diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c index c53f6f276d5c..58a0eae4f41b 100644 --- a/drivers/usb/host/xhci-mtk.c +++ b/drivers/usb/host/xhci-mtk.c @@ -602,7 +602,7 @@ static int xhci_mtk_probe(struct platform_device *pdev) goto dealloc_usb2_hcd; if (wakeup_irq > 0) { - ret = dev_pm_set_dedicated_wake_irq(dev, wakeup_irq); + ret = dev_pm_set_dedicated_wake_irq_reverse(dev, wakeup_irq); if (ret) { dev_err(dev, "set wakeup irq %d failed\n", wakeup_irq); goto dealloc_usb3_hcd; diff --git a/drivers/usb/mtu3/mtu3_plat.c b/drivers/usb/mtu3/mtu3_plat.c index f13531022f4a..4309ed939178 100644 --- a/drivers/usb/mtu3/mtu3_plat.c +++ b/drivers/usb/mtu3/mtu3_plat.c @@ -337,7 +337,7 @@ static int mtu3_probe(struct platform_device *pdev) goto comm_init_err; if (ssusb->wakeup_irq > 0) { - ret = dev_pm_set_dedicated_wake_irq(dev, ssusb->wakeup_irq); + ret = dev_pm_set_dedicated_wake_irq_reverse(dev, ssusb->wakeup_irq); if (ret) { dev_err(dev, "failed to set wakeup irq %d\n", ssusb->wakeup_irq); goto comm_exit; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff88bb3e44fc..1ab29e61b078 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -119,6 +119,13 @@ struct cpufreq_policy { bool strict_target; /* + * Set if inefficient frequencies were found in the frequency table. 
@@ -385,7 +398,7 @@ struct cpufreq_driver {
 
 	/* flags */
 
 	/*
-	 * Set by drivers that need to update internale upper and lower boundaries along
+	 * Set by drivers that need to update internal upper and lower boundaries along
 	 * with the target frequency and so the core and governors should also invoke
 	 * the diver if the target frequency does not change, but the policy min or max
 	 * may have changed.
@@ -627,9 +640,11 @@ struct cpufreq_governor *cpufreq_fallback_governor(void);
 static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy)
 {
 	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
+		__cpufreq_driver_target(policy, policy->max,
+					CPUFREQ_RELATION_HE);
 	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
+		__cpufreq_driver_target(policy, policy->min,
+					CPUFREQ_RELATION_LE);
 }
 
 /* Governor attribute set */
@@ -660,10 +675,11 @@ struct governor_attr {
  *********************************************************************/
 
 /* Special Values of .frequency field */
-#define CPUFREQ_ENTRY_INVALID	~0u
-#define CPUFREQ_TABLE_END	~1u
+#define CPUFREQ_ENTRY_INVALID	 ~0u
+#define CPUFREQ_TABLE_END	 ~1u
 /* Special Values of .flags field */
-#define CPUFREQ_BOOST_FREQ	(1 << 0)
+#define CPUFREQ_BOOST_FREQ	 (1 << 0)
+#define CPUFREQ_INEFFICIENT_FREQ (1 << 1)
 
 struct cpufreq_frequency_table {
 	unsigned int	flags;
@@ -740,6 +756,22 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 		continue;					\
 	else
 
+/**
+ * cpufreq_for_each_efficient_entry_idx - iterate with index over a cpufreq
+ *	frequency_table excluding CPUFREQ_ENTRY_INVALID and
+ *	CPUFREQ_INEFFICIENT_FREQ frequencies.
+ * @pos: the &struct cpufreq_frequency_table to use as a loop cursor.
+ * @table: the &struct cpufreq_frequency_table to iterate over.
+ * @idx: the table entry currently being processed.
+ * @efficiencies: set to true to only iterate over efficient frequencies.
+ */
+
+#define cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies)	\
+	cpufreq_for_each_valid_entry_idx(pos, table, idx)			\
+		if (efficiencies && (pos->flags & CPUFREQ_INEFFICIENT_FREQ))	\
+			continue;						\
+		else
+
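Note: an illustrative helper (not part of the patch, and the name is made up) showing the new iterator in use:

static int count_usable_entries(struct cpufreq_policy *policy,
				bool efficiencies)
{
	struct cpufreq_frequency_table *pos;
	int idx, count = 0;

	/* Skips invalid entries always, inefficient ones only on request. */
	cpufreq_for_each_efficient_entry_idx(pos, policy->freq_table, idx,
					     efficiencies)
		count++;

	return count;
}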
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
 
@@ -764,14 +796,15 @@ bool policy_has_boost_freq(struct cpufreq_policy *policy);
 
 /* Find lowest freq at or above target in a table in ascending order */
 static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq >= target_freq)
@@ -785,14 +818,15 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy,
 
 /* Find lowest freq at or above target in a table in descending order */
 static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -815,26 +849,30 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_l(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_al(policy, target_freq);
+		return cpufreq_table_find_index_al(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dl(policy, target_freq);
+		return cpufreq_table_find_index_dl(policy, target_freq,
+						   efficiencies);
 }
 
 /* Find highest freq at or below target in a table in ascending order */
 static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -857,14 +895,15 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy,
 
 /* Find highest freq at or below target in a table in descending order */
 static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq <= target_freq)
@@ -878,26 +917,30 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_h(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_ah(policy, target_freq);
+		return cpufreq_table_find_index_ah(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dh(policy, target_freq);
+		return cpufreq_table_find_index_dh(policy, target_freq,
+						   efficiencies);
 }
 
 /* Find closest freq to target in a table in ascending order */
 static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -924,14 +967,15 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy,
 
 /* Find closest freq to target in a table in descending order */
 static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy,
-					      unsigned int target_freq)
+					      unsigned int target_freq,
+					      bool efficiencies)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
 	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
 	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
+	cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
@@ -958,35 +1002,58 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy,
 
 /* Works only on sorted freq-tables */
 static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy,
-					     unsigned int target_freq)
+					     unsigned int target_freq,
+					     bool efficiencies)
 {
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
-		return cpufreq_table_find_index_ac(policy, target_freq);
+		return cpufreq_table_find_index_ac(policy, target_freq,
+						   efficiencies);
 	else
-		return cpufreq_table_find_index_dc(policy, target_freq);
+		return cpufreq_table_find_index_dc(policy, target_freq,
+						   efficiencies);
 }
 
 static inline int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 						 unsigned int target_freq,
 						 unsigned int relation)
 {
+	bool efficiencies = policy->efficiencies_available &&
+			    (relation & CPUFREQ_RELATION_E);
+	int idx;
+
+	/* cpufreq_table_index_unsorted() has no use for this flag anyway */
+	relation &= ~CPUFREQ_RELATION_E;
+
 	if (unlikely(policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED))
 		return cpufreq_table_index_unsorted(policy, target_freq,
 						    relation);
-
+retry:
 	switch (relation) {
 	case CPUFREQ_RELATION_L:
-		return cpufreq_table_find_index_l(policy, target_freq);
+		idx = cpufreq_table_find_index_l(policy, target_freq,
+						 efficiencies);
+		break;
 	case CPUFREQ_RELATION_H:
-		return cpufreq_table_find_index_h(policy, target_freq);
+		idx = cpufreq_table_find_index_h(policy, target_freq,
+						 efficiencies);
+		break;
 	case CPUFREQ_RELATION_C:
-		return cpufreq_table_find_index_c(policy, target_freq);
+		idx = cpufreq_table_find_index_c(policy, target_freq,
+						 efficiencies);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
 	}
+
+	if (idx < 0 && efficiencies) {
+		efficiencies = false;
+		goto retry;
+	}
+
+	return idx;
 }
 
 static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy *policy)
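Note: the retry above is what makes the E bit purely advisory. A hedged illustration (the helper name is hypothetical): if the efficient-only walk returns no index (idx < 0, e.g. when the filtering leaves no candidate entry), the same relation is re-evaluated over all valid entries, so asking for an efficient frequency can change which entry is picked but never makes the lookup itself fail.

static int resolve_index(struct cpufreq_policy *policy, unsigned int khz)
{
	return cpufreq_frequency_table_target(policy, khz,
					      CPUFREQ_RELATION_LE);
}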
@@ -1003,6 +1070,37 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy
 	return count;
 }
 
+/**
+ * cpufreq_table_set_inefficient() - Mark a frequency as inefficient
+ * @policy: the &struct cpufreq_policy containing the inefficient frequency
+ * @frequency: the inefficient frequency
+ *
+ * The &struct cpufreq_policy must use a sorted frequency table
+ *
+ * Return: %0 on success or a negative errno code
+ */
+
+static inline int
+cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
+			      unsigned int frequency)
+{
+	struct cpufreq_frequency_table *pos;
+
+	/* Not supported */
+	if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED)
+		return -EINVAL;
+
+	cpufreq_for_each_valid_entry(pos, policy->freq_table) {
+		if (pos->frequency == frequency) {
+			pos->flags |= CPUFREQ_INEFFICIENT_FREQ;
+			policy->efficiencies_available = true;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
 static inline int parse_perf_domain(int cpu, const char *list_name,
 				    const char *cell_name)
 {
@@ -1041,7 +1139,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_
 		if (cpu == pcpu)
 			continue;
 
-		ret = parse_perf_domain(pcpu, list_name, cell_name);
+		ret = parse_perf_domain(cpu, list_name, cell_name);
 		if (ret < 0)
 			continue;
 
@@ -1071,6 +1169,13 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 	return false;
 }
 
+static inline int
+cpufreq_table_set_inefficient(struct cpufreq_policy *policy,
+			      unsigned int frequency)
+{
+	return -EINVAL;
+}
+
 static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name,
 						     const char *cell_name, struct cpumask *cpumask)
 {
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 991911048857..773c83730906 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -99,6 +99,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_AP_DTPM_CPU_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,
@@ -246,7 +247,6 @@ enum cpuhp_state {
 	CPUHP_AP_MM_DEMOTION_ONLINE,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
-	CPUHP_AP_DTPM_CPU_ONLINE,
 	CPUHP_AP_ACTIVE,
 	CPUHP_ONLINE,
 };
diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index e80a332e3d8a..2890f6370eb9 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -23,34 +23,32 @@ struct dtpm {
 	u64 power_max;
 	u64 power_min;
 	int weight;
-	void *private;
 };
 
 struct dtpm_ops {
 	u64 (*set_power_uw)(struct dtpm *, u64);
 	u64 (*get_power_uw)(struct dtpm *);
+	int (*update_power_uw)(struct dtpm *);
 	void (*release)(struct dtpm *);
 };
 
-struct dtpm_descr;
-
-typedef int (*dtpm_init_t)(struct dtpm_descr *);
+typedef int (*dtpm_init_t)(void);
 
 struct dtpm_descr {
-	struct dtpm *parent;
-	const char *name;
 	dtpm_init_t init;
 };
 
 /* Init section thermal table */
-extern struct dtpm_descr *__dtpm_table[];
-extern struct dtpm_descr *__dtpm_table_end[];
+extern struct dtpm_descr __dtpm_table[];
+extern struct dtpm_descr __dtpm_table_end[];
 
-#define DTPM_TABLE_ENTRY(name)				\
-	static typeof(name) *__dtpm_table_entry_##name	\
-	__used __section("__dtpm_table") = &name
+#define DTPM_TABLE_ENTRY(name, __init)				\
+	static struct dtpm_descr __dtpm_table_entry_##name	\
+	__used __section("__dtpm_table") = {			\
+		.init = __init,					\
+	}
 
-#define DTPM_DECLARE(name)	DTPM_TABLE_ENTRY(name)
+#define DTPM_DECLARE(name, init)	DTPM_TABLE_ENTRY(name, init)
 
 #define for_each_dtpm_table(__dtpm)	\
 	for (__dtpm = __dtpm_table;	\
@@ -62,11 +60,11 @@ static inline struct dtpm *to_dtpm(struct powercap_zone *zone)
 	return container_of(zone, struct dtpm, zone);
 }
 
-int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max);
+int dtpm_update_power(struct dtpm *dtpm);
 
 int dtpm_release_zone(struct powercap_zone *pcz);
 
-struct dtpm *dtpm_alloc(struct dtpm_ops *ops);
+void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops);
 
 void dtpm_unregister(struct dtpm *dtpm);
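Note: for readers new to the linker-table pattern behind DTPM_DECLARE(), a minimal sketch under the declarations above (the actual walk lives in the DTPM core; this helper is illustrative). Each DTPM_DECLARE() emits a struct dtpm_descr into the __dtpm_table section, the linker script bounds that section with __dtpm_table/__dtpm_table_end, and boot code iterates the span:

static void __init dtpm_walk_init_table(void)
{
	struct dtpm_descr *dtpm_descr;

	/* Visit every DTPM_DECLARE()'d descriptor and run its init hook. */
	for_each_dtpm_table(dtpm_descr)
		dtpm_descr->init();
}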
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 39dcadd492b5..6377adc3b78d 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -17,19 +17,30 @@
  *		device). It can be a total power: static and dynamic.
  * @cost:	The cost coefficient associated with this level, used during
  *		energy calculation. Equal to: power * max_frequency / frequency
+ * @flags:	see "em_perf_state flags" description below.
  */
 struct em_perf_state {
 	unsigned long frequency;
 	unsigned long power;
 	unsigned long cost;
+	unsigned long flags;
 };
 
+/*
+ * em_perf_state flags:
+ *
+ * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is,
+ * in this em_perf_domain, another performance state with a higher frequency
+ * but a lower or equal power cost. Such inefficient states are ignored when
+ * using em_pd_get_efficient_*() functions.
+ */
+#define EM_PERF_STATE_INEFFICIENT BIT(0)
+
 /**
  * struct em_perf_domain - Performance domain
  * @table:		List of performance states, in ascending order
  * @nr_perf_states:	Number of performance states
- * @milliwatts:		Flag indicating the power values are in milli-Watts
- *			or some other scale.
+ * @flags:		See "em_perf_domain flags"
  * @cpus:		Cpumask covering the CPUs of the domain. It's here
  *			for performance reasons to avoid potential cache
  *			misses during energy calculations in the scheduler
@@ -44,10 +55,22 @@ struct em_perf_state {
 struct em_perf_domain {
 	struct em_perf_state *table;
 	int nr_perf_states;
-	int milliwatts;
+	unsigned long flags;
 	unsigned long cpus[];
 };
 
+/*
+ * em_perf_domain flags:
+ *
+ * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
+ * other scale.
+ *
+ * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
+ * energy consumption.
+ */
+#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
+
 #define em_span_cpus(em) (to_cpumask((em)->cpus))
 
 #ifdef CONFIG_ENERGY_MODEL
@@ -102,6 +125,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
+ * em_pd_get_efficient_state() - Get an efficient performance state from the EM
+ * @pd   : Performance domain for which we want an efficient frequency
+ * @freq : Frequency to map with the EM
+ *
+ * It is called from the scheduler code quite frequently and, as a
+ * consequence, doesn't implement any check.
+ *
+ * Return: An efficient performance state, high enough to meet @freq
+ * requirement.
+ */
+static inline
+struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd,
+						unsigned long freq)
+{
+	struct em_perf_state *ps;
+	int i;
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		ps = &pd->table[i];
+		if (ps->frequency >= freq) {
+			if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
+			    ps->flags & EM_PERF_STATE_INEFFICIENT)
+				continue;
+			break;
+		}
+	}
+
+	return ps;
+}
+
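Note: a worked trace of the selection above, with made-up states. Suppose table[] holds 500 MHz (cost 120), 800 MHz (cost 180, flagged EM_PERF_STATE_INEFFICIENT) and 1000 MHz (cost 150), and the domain has EM_PERF_DOMAIN_SKIP_INEFFICIENCIES set. A request mapping to 600 MHz first matches the 800 MHz state, skips it as inefficient, and settles on 1000 MHz, which does more work per unit of energy:

static struct em_perf_state *pick_state(struct em_perf_domain *pd)
{
	/* With the hypothetical table above, this returns the 1000 MHz state. */
	return em_pd_get_efficient_state(pd, 600000);
}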
+/**
  * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
  *		performance domain
  * @pd		: performance domain for which energy has to be estimated
@@ -123,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 {
 	unsigned long freq, scale_cpu;
 	struct em_perf_state *ps;
-	int i, cpu;
+	int cpu;
 
 	if (!sum_util)
 		return 0;
@@ -148,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * Find the lowest performance state of the Energy Model above the
 	 * requested frequency.
 	 */
-	for (i = 0; i < pd->nr_perf_states; i++) {
-		ps = &pd->table[i];
-		if (ps->frequency >= freq)
-			break;
-	}
+	ps = em_pd_get_efficient_state(pd, freq);
 
 	/*
 	 * The capacity of a CPU in the domain at the performance state (ps)
diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h
index cd5b62db9084..e63a63aa47a3 100644
--- a/include/linux/pm_wakeirq.h
+++ b/include/linux/pm_wakeirq.h
@@ -17,8 +17,8 @@
 #ifdef CONFIG_PM
 
 extern int dev_pm_set_wake_irq(struct device *dev, int irq);
-extern int dev_pm_set_dedicated_wake_irq(struct device *dev,
-					 int irq);
+extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq);
+extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq);
 extern void dev_pm_clear_wake_irq(struct device *dev);
 extern void dev_pm_enable_wake_irq(struct device *dev);
 extern void dev_pm_disable_wake_irq(struct device *dev);
@@ -35,6 +35,11 @@ static inline int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
 	return 0;
 }
 
+static inline int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq)
+{
+	return 0;
+}
+
 static inline void dev_pm_clear_wake_irq(struct device *dev)
 {
 }
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index a332ccd829e2..0153b0ca7b23 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -2,7 +2,7 @@
 /*
  * Energy Model of devices
  *
- * Copyright (c) 2018-2020, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
  * Written by: Quentin Perret, Arm ltd.
  * Improvements provided by: Lukasz Luba, Arm ltd.
  */
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt) "energy_model: " fmt
 
 #include <linux/cpu.h>
+#include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/debugfs.h>
 #include <linux/energy_model.h>
@@ -42,6 +43,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
 	debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
 	debugfs_create_ulong("power", 0444, d, &ps->power);
 	debugfs_create_ulong("cost", 0444, d, &ps->cost);
+	debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
 }
 
 static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -55,7 +57,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
 static int em_debug_units_show(struct seq_file *s, void *unused)
 {
 	struct em_perf_domain *pd = s->private;
-	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+	char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
+		"milliWatts" : "bogoWatts";
 
 	seq_printf(s, "%s\n", units);
 
@@ -63,6 +66,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_units);
 
+static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
+{
+	struct em_perf_domain *pd = s->private;
+	int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
+
+	seq_printf(s, "%d\n", enabled);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+
 static void em_debug_create_pd(struct device *dev)
 {
 	struct dentry *d;
@@ -76,6 +90,8 @@ static void em_debug_create_pd(struct device *dev)
 			    &em_debug_cpus_fops);
 
 	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+	debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
+			    &em_debug_skip_inefficiencies_fops);
 
 	/* Create a sub-directory for each performance state */
 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -107,8 +123,7 @@ static void em_debug_remove_pd(struct device *dev) {}
 static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 				int nr_states, struct em_data_callback *cb)
 {
-	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
-	unsigned long power, freq, prev_freq = 0;
+	unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
 	struct em_perf_state *table;
 	int i, ret;
 	u64 fmax;
@@ -153,27 +168,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 
 		table[i].power = power;
 		table[i].frequency = prev_freq = freq;
-
-		/*
-		 * The hertz/watts efficiency ratio should decrease as the
-		 * frequency grows on sane platforms. But this isn't always
-		 * true in practice so warn the user if a higher OPP is more
-		 * power efficient than a lower one.
-		 */
-		opp_eff = freq / power;
-		if (opp_eff >= prev_opp_eff)
-			dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
-					i, i - 1);
-		prev_opp_eff = opp_eff;
 	}
 
 	/* Compute the cost of each performance state. */
 	fmax = (u64) table[nr_states - 1].frequency;
-	for (i = 0; i < nr_states; i++) {
+	for (i = nr_states - 1; i >= 0; i--) {
 		unsigned long power_res = em_scale_power(table[i].power);
 
 		table[i].cost = div64_u64(fmax * power_res,
 					  table[i].frequency);
+		if (table[i].cost >= prev_cost) {
+			table[i].flags = EM_PERF_STATE_INEFFICIENT;
+			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+				table[i].frequency);
+		} else {
+			prev_cost = table[i].cost;
+		}
 	}
 
 	pd->table = table;
@@ -222,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states,
 	return 0;
 }
 
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+	struct em_perf_domain *pd = dev->em_pd;
+	struct em_perf_state *table;
+	struct cpufreq_policy *policy;
+	int found = 0;
+	int i;
+
+	if (!_is_cpu_device(dev) || !pd)
+		return;
+
+	policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+	if (!policy) {
+		dev_warn(dev, "EM: Access to CPUFreq policy failed");
+		return;
+	}
+
+	table = pd->table;
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+			continue;
+
+		if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+			found++;
+	}
+
+	if (!found)
+		return;
+
+	/*
+	 * Efficiencies have been installed in CPUFreq; inefficient frequencies
+	 * will be skipped. The EM can do the same.
+	 */
+	pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
 /**
  * em_pd_get() - Return the performance domain for a device
  * @dev : Device to find the performance domain for
@@ -335,7 +382,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	if (ret)
 		goto unlock;
 
-	dev->em_pd->milliwatts = milliwatts;
+	if (milliwatts)
+		dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+
+	em_cpufreq_update_efficiencies(dev);
 
 	em_debug_create_pd(dev);
 	dev_info(dev, "EM: created perf domain\n");
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 559acef3fddb..9ed9b744876c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,7 +300,7 @@ static int create_image(int platform_mode)
 	if (error || hibernation_test(TEST_PLATFORM))
 		goto Platform_finish;
 
-	error = suspend_disable_secondary_cpus();
+	error = pm_sleep_disable_secondary_cpus();
 	if (error || hibernation_test(TEST_CPUS))
 		goto Enable_cpus;
 
@@ -342,7 +342,7 @@ static int create_image(int platform_mode)
 	local_irq_enable();
 
  Enable_cpus:
-	suspend_enable_secondary_cpus();
+	pm_sleep_enable_secondary_cpus();
 
 	/* Allow architectures to do nosmt-specific post-resume dances */
 	if (!in_suspend)
@@ -466,6 +466,8 @@ static int resume_target_kernel(bool platform_mode)
 	if (error)
 		goto Cleanup;
 
+	cpuidle_pause();
+
 	error = hibernate_resume_nonboot_cpu_disable();
 	if (error)
 		goto Enable_cpus;
@@ -509,7 +511,7 @@ static int resume_target_kernel(bool platform_mode)
 	local_irq_enable();
 
  Enable_cpus:
-	suspend_enable_secondary_cpus();
+	pm_sleep_enable_secondary_cpus();
 
  Cleanup:
 	platform_restore_cleanup(platform_mode);
@@ -587,7 +589,7 @@ int hibernation_platform_enter(void)
 	if (error)
 		goto Platform_finish;
 
-	error = suspend_disable_secondary_cpus();
+	error = pm_sleep_disable_secondary_cpus();
 	if (error)
 		goto Enable_cpus;
 
@@ -609,7 +611,7 @@ int hibernation_platform_enter(void)
 	local_irq_enable();
 
  Enable_cpus:
-	suspend_enable_secondary_cpus();
+	pm_sleep_enable_secondary_cpus();
 
  Platform_finish:
 	hibernation_ops->finish();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 778bf431ec02..326f8d032eb5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -4,6 +4,8 @@
 #include <linux/utsname.h>
 #include <linux/freezer.h>
 #include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
 
 struct swsusp_info {
 	struct new_utsname	uts;
@@ -310,3 +312,15 @@ extern int pm_wake_lock(const char *buf);
 extern int pm_wake_unlock(const char *buf);
 
 #endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+	cpuidle_pause();
+	return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+	suspend_enable_secondary_cpus();
+	cpuidle_resume();
+}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index eb75f394a059..80cc1f0f502b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -97,7 +97,6 @@ static void s2idle_enter(void)
 	raw_spin_unlock_irq(&s2idle_lock);
 
 	cpus_read_lock();
-	cpuidle_resume();
 
 	/* Push all the CPUs into the idle loop. */
 	wake_up_all_idle_cpus();
@@ -105,7 +104,6 @@ static void s2idle_enter(void)
 	swait_event_exclusive(s2idle_wait_head,
 			      s2idle_state == S2IDLE_STATE_WAKE);
 
-	cpuidle_pause();
 	cpus_read_unlock();
 
 	raw_spin_lock_irq(&s2idle_lock);
@@ -162,11 +160,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake);
 static bool valid_state(suspend_state_t state)
 {
 	/*
-	 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
-	 * support and need to be valid to the low level
-	 * implementation, no valid callback implies that none are valid.
+	 * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+	 * support and need to be valid to the low-level implementation.
+	 *
+	 * No ->valid() or ->enter() callback implies that none are valid.
 	 */
-	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+		suspend_ops->enter;
 }
 
 void __init pm_states_init(void)
@@ -238,7 +238,7 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
 
 static bool sleep_state_supported(suspend_state_t state)
 {
-	return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
+	return state == PM_SUSPEND_TO_IDLE || valid_state(state);
 }
 
 static int platform_suspend_prepare(suspend_state_t state)
@@ -422,7 +422,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 		goto Platform_wake;
 	}
 
-	error = suspend_disable_secondary_cpus();
+	error = pm_sleep_disable_secondary_cpus();
 	if (error || suspend_test(TEST_CPUS))
 		goto Enable_cpus;
 
@@ -452,7 +452,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	BUG_ON(irqs_disabled());
 
  Enable_cpus:
-	suspend_enable_secondary_cpus();
+	pm_sleep_enable_secondary_cpus();
 
  Platform_wake:
 	platform_resume_noirq(state);
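Note: with valid_state() now also requiring ->enter(), platform code must supply both callbacks for a state to be reported valid. A hedged sketch with hypothetical names (suspend_valid_only_mem() is the stock helper referenced in the hunk above):

static int myplat_suspend_enter(suspend_state_t state)
{
	/* Program the hardware for the requested sleep state. */
	return 0;
}

static const struct platform_suspend_ops myplat_suspend_ops = {
	.valid = suspend_valid_only_mem,
	.enter = myplat_suspend_enter,
};

An ops structure registered via suspend_set_ops() without ->enter now simply fails valid_state() up front instead of being discovered unusable at suspend time.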
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3cb89baebc79..ff326c2cb77b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -299,7 +299,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	return error;
 }
 
-static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
+static int hib_wait_io(struct hib_bio_batch *hb)
 {
 	/*
 	 * We are relying on the behavior of blk_plug that a thread with
@@ -705,22 +705,19 @@ static int save_image_lzo(struct swap_map_handle *handle,
 		goto out_clean;
 	}
 
-	data = vmalloc(array_size(nr_threads, sizeof(*data)));
+	data = vzalloc(array_size(nr_threads, sizeof(*data)));
 	if (!data) {
 		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
-	for (thr = 0; thr < nr_threads; thr++)
-		memset(&data[thr], 0, offsetof(struct cmp_data, go));
-
-	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+	crc = kzalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
 		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
-	memset(crc, 0, offsetof(struct crc_data, go));
 
 	/*
 	 * Start the compression threads.
@@ -1198,22 +1195,19 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		goto out_clean;
 	}
 
-	data = vmalloc(array_size(nr_threads, sizeof(*data)));
+	data = vzalloc(array_size(nr_threads, sizeof(*data)));
 	if (!data) {
 		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
-	for (thr = 0; thr < nr_threads; thr++)
-		memset(&data[thr], 0, offsetof(struct dec_data, go));
-
-	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+	crc = kzalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
 		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
-	memset(crc, 0, offsetof(struct crc_data, go));
 
 	clean_pages_on_decompress = true;
 
@@ -1521,9 +1515,10 @@ end:
 int swsusp_check(void)
 {
 	int error;
+	void *holder;
 
 	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
-					    FMODE_READ, NULL);
+					    FMODE_READ | FMODE_EXCL, &holder);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
@@ -1545,7 +1540,7 @@ int swsusp_check(void)
 
 put:
 		if (error)
-			blkdev_put(hib_resume_bdev, FMODE_READ);
+			blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL);
 		else
 			pr_debug("Image signature found, resuming\n");
 	} else {
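Note: the FMODE_EXCL change claims the resume device exclusively while the image signature is probed, keeping concurrent users such as mount or swapon away from it. A hedged sketch of the claim/release pairing, assuming the blkdev_get_by_dev(dev_t, fmode_t, void *holder) interface of this kernel generation (helper names are made up):

static struct block_device *claim_resume_bdev(dev_t devt, void *holder)
{
	/* FMODE_EXCL makes this an exclusive claim keyed by 'holder'. */
	return blkdev_get_by_dev(devt, FMODE_READ | FMODE_EXCL, holder);
}

static void release_resume_bdev(struct block_device *bdev)
{
	/* The claim must be dropped with the same mode flags it was taken with. */
	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
}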