author     Linus Torvalds <torvalds@linux-foundation.org>   2021-08-31 13:21:58 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2021-08-31 13:21:58 -0700
commit     5cbba60596b1f32f637190ca9ed5b1acdadb852c (patch)
tree       4e2b647602054304f1ec9e58603defe954fa43b8
parent     9b2eacd8f04625c6cb2dd82469972a3bba3a783a (diff)
parent     fe583359ddf0d509275b87b635fa8b2e3794321e (diff)
Merge tag 'pm-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management updates from Rafael Wysocki:
"These address some PCI device power management issues, add new
hardware support to the RAPL power capping driver, add HWP guaranteed
performance change notification support to the intel_pstate driver,
replace deprecated CPU-hotplug functions in a few places, update CPU
PM notifiers to use raw spinlocks, update the PM domains framework
(new DT property support, Kconfig fix), do a couple of cleanups in
code related to system sleep, and improve the energy model and the
schedutil cpufreq governor.
Specifics:
- Address 3 PCI device power management issues (Rafael Wysocki).
- Add Power Limit4 support for Alder Lake to the Intel RAPL power
capping driver (Sumeet Pawnikar).
- Add HWP guaranteed performance change notification support to the
intel_pstate driver (Srinivas Pandruvada).
- Replace deprecated CPU-hotplug functions in code related to power
management (Sebastian Andrzej Siewior); the mechanical rename is
sketched right after this list.
- Update CPU PM notifiers to use raw spinlocks (Valentin Schneider).
- Add support for 'required-opps' DT property to the generic power
domains (genpd) framework and use this property for I2C on ARM64
sc7180 (Rajendra Nayak).
- Fix Kconfig issue related to genpd (Geert Uytterhoeven).
- Increase energy calculation precision in the Energy Model (Lukasz
Luba).
- Fix kobject deletion in the exit code of the schedutil cpufreq
governor (Kevin Hao).
- Unmark some functions as kernel-doc in the PM core to avoid
false-positive documentation build warnings (Randy Dunlap).
- Check RTC features instead of ops in suspend_test (Alexandre
Belloni)"
* tag 'pm-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
PM: domains: Fix domain attach for CONFIG_PM_OPP=n
powercap: Add Power Limit4 support for Alder Lake SoC
cpufreq: intel_pstate: Process HWP Guaranteed change notification
thermal: intel: Allow processing of HWP interrupt
notifier: Remove atomic_notifier_call_chain_robust()
PM: cpu: Make notifier chain use a raw_spinlock_t
PM: sleep: unmark 'state' functions as kernel-doc
arm64: dts: sc7180: Add required-opps for i2c
PM: domains: Add support for 'required-opps' to set default perf state
opp: Don't print an error if required-opps is missing
cpufreq: schedutil: Use kobject release() method to free sugov_tunables
PM: EM: Increase energy calculation precision
PM: sleep: check RTC features instead of ops in suspend_test
PM: sleep: s2idle: Replace deprecated CPU-hotplug functions
cpufreq: Replace deprecated CPU-hotplug functions
powercap: intel_rapl: Replace deprecated CPU-hotplug functions
PCI: PM: Enable PME if it can be signaled from D3cold
PCI: PM: Avoid forcing PCI_D0 for wakeup reasons inconsistently
PCI: Use pci_update_current_state() in pci_enable_device_flags()
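
Among the commits above, "cpufreq: schedutil: Use kobject release() method to free sugov_tunables" closes a use-after-free window: kfree()ing a structure that embeds a kobject can race with a sysfs reader that still holds a reference, so the free belongs in the kobj_type's release() callback, which the kobject core invokes only once the last reference is dropped. A condensed sketch of the pattern, using an illustrative struct rather than the driver's own:

#include <linux/kobject.h>
#include <linux/slab.h>

struct tunables {
    struct kobject kobj;        /* refcounted sysfs anchor */
    unsigned int rate_limit_us;
};

/* Runs only after the last kobject_put(); safe to free here. */
static void tunables_release(struct kobject *kobj)
{
    kfree(container_of(kobj, struct tunables, kobj));
}

static struct kobj_type tunables_ktype = {
    .release = tunables_release,    /* instead of kfree() at call sites */
};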
-rw-r--r--  arch/arm64/boot/dts/qcom/sc7180.dtsi       | 24
-rw-r--r--  drivers/base/power/domain.c                | 30
-rw-r--r--  drivers/cpufreq/acpi-cpufreq.c             |  4
-rw-r--r--  drivers/cpufreq/cpufreq.c                  |  6
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c         |  4
-rw-r--r--  drivers/cpufreq/intel_pstate.c             | 43
-rw-r--r--  drivers/cpufreq/powernow-k8.c              |  6
-rw-r--r--  drivers/cpufreq/powernv-cpufreq.c          |  4
-rw-r--r--  drivers/opp/of.c                           | 12
-rw-r--r--  drivers/pci/pci.c                          | 31
-rw-r--r--  drivers/powercap/intel_rapl_common.c       | 50
-rw-r--r--  drivers/powercap/intel_rapl_msr.c          |  2
-rw-r--r--  drivers/thermal/intel/therm_throt.c        |  7
-rw-r--r--  drivers/thermal/intel/thermal_interrupt.h  |  3
-rw-r--r--  include/linux/energy_model.h               | 16
-rw-r--r--  include/linux/notifier.h                   |  2
-rw-r--r--  include/linux/pm_domain.h                  |  1
-rw-r--r--  kernel/cpu_pm.c                            | 50
-rw-r--r--  kernel/notifier.c                          | 19
-rw-r--r--  kernel/power/energy_model.c                |  4
-rw-r--r--  kernel/power/main.c                        |  2
-rw-r--r--  kernel/power/suspend.c                     |  4
-rw-r--r--  kernel/power/suspend_test.c                |  2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c           | 16
24 files changed, 235 insertions(+), 107 deletions(-)
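
One pattern in the genpd part of the diff that follows is easy to miss: __genpd_dev_pm_attach() now translates a device's "required-opps" phandle into a performance state and installs it as the domain default, dropping it again on detach. A trimmed sketch of the lookup half of that flow (the example_ function name is hypothetical and error handling is shortened):

#include <linux/pm_domain.h>
#include <linux/pm_opp.h>

static int example_set_default_pstate(struct device *dev, int index)
{
    /* Map the "required-opps" phandle to a performance state. */
    int pstate = of_get_required_opp_performance_state(dev->of_node, index);

    /* Property absent (or OPPs unsupported): simply nothing to do. */
    if (pstate == -ENODEV || pstate == -EOPNOTSUPP)
        return 0;
    if (pstate < 0)
        return pstate;      /* genuine lookup failure */

    /* Vote for that state as the domain's default. */
    return dev_pm_genpd_set_performance_state(dev, pstate);
}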
diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi
index a9a052f8c63c..e7f0e5cde424 100644
--- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
@@ -786,6 +786,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -838,6 +840,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -890,6 +894,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -924,6 +930,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -976,6 +984,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1010,6 +1020,8 @@
                 <&aggre1_noc MASTER_QUP_0 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1075,6 +1087,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1127,6 +1141,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1161,6 +1177,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1213,6 +1231,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1247,6 +1267,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
@@ -1299,6 +1321,8 @@
                 <&aggre2_noc MASTER_QUP_1 0 &mc_virt SLAVE_EBI1 0>;
             interconnect-names = "qup-core", "qup-config",
                          "qup-memory";
+            power-domains = <&rpmhpd SC7180_CX>;
+            required-opps = <&rpmhpd_opp_low_svs>;
             status = "disabled";
         };
 
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index f10688e83226..5db704f02e71 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2604,6 +2604,12 @@ static void genpd_dev_pm_detach(struct device *dev, bool power_off)
 
     dev_dbg(dev, "removing from PM domain %s\n", pd->name);
 
+    /* Drop the default performance state */
+    if (dev_gpd_data(dev)->default_pstate) {
+        dev_pm_genpd_set_performance_state(dev, 0);
+        dev_gpd_data(dev)->default_pstate = 0;
+    }
+
     for (i = 1; i < GENPD_RETRY_MAX_MS; i <<= 1) {
         ret = genpd_remove_device(pd, dev);
         if (ret != -EAGAIN)
@@ -2643,6 +2649,7 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
 {
     struct of_phandle_args pd_args;
     struct generic_pm_domain *pd;
+    int pstate;
     int ret;
 
     ret = of_parse_phandle_with_args(dev->of_node, "power-domains",
@@ -2681,10 +2688,29 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
         genpd_unlock(pd);
     }
 
-    if (ret)
+    if (ret) {
         genpd_remove_device(pd, dev);
+        return -EPROBE_DEFER;
+    }
 
-    return ret ? -EPROBE_DEFER : 1;
+    /* Set the default performance state */
+    pstate = of_get_required_opp_performance_state(dev->of_node, index);
+    if (pstate < 0 && pstate != -ENODEV && pstate != -EOPNOTSUPP) {
+        ret = pstate;
+        goto err;
+    } else if (pstate > 0) {
+        ret = dev_pm_genpd_set_performance_state(dev, pstate);
+        if (ret)
+            goto err;
+        dev_gpd_data(dev)->default_pstate = pstate;
+    }
+
+    return 1;
+
+err:
+    dev_err(dev, "failed to set required performance state for power-domain %s: %d\n",
+        pd->name, ret);
+    genpd_remove_device(pd, dev);
+    return ret;
 }
 
 /**
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 7e7450453714..b49612895c78 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -163,9 +163,9 @@ static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
     if (ret || val > 1)
         return -EINVAL;
 
-    get_online_cpus();
+    cpus_read_lock();
     set_boost(policy, val);
-    put_online_cpus();
+    cpus_read_unlock();
 
     return count;
 }
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 45f3416988f1..06c526d66dd3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2654,18 +2654,18 @@ int cpufreq_boost_trigger_state(int state)
     cpufreq_driver->boost_enabled = state;
     write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-    get_online_cpus();
+    cpus_read_lock();
     for_each_active_policy(policy) {
         ret = cpufreq_driver->set_boost(policy, state);
         if (ret)
             goto err_reset_state;
     }
-    put_online_cpus();
+    cpus_read_unlock();
 
     return 0;
 
 err_reset_state:
-    put_online_cpus();
+    cpus_read_unlock();
 
     write_lock_irqsave(&cpufreq_driver_lock, flags);
     cpufreq_driver->boost_enabled = !state;
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index ac361a8b1d3b..eb4320b619c9 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -418,7 +418,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
     default_powersave_bias = powersave_bias;
     cpumask_clear(&done);
 
-    get_online_cpus();
+    cpus_read_lock();
     for_each_online_cpu(cpu) {
         struct cpufreq_policy *policy;
         struct policy_dbs_info *policy_dbs;
@@ -442,7 +442,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
         od_tuners = dbs_data->tuners;
         od_tuners->powersave_bias = default_powersave_bias;
     }
-    put_online_cpus();
+    cpus_read_unlock();
 }
 
 void od_register_powersave_bias_handler(unsigned int (*f)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index bb4549959b11..b4ffe6c8a0d0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -32,6 +32,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/cpufeature.h>
 #include <asm/intel-family.h>
+#include "../drivers/thermal/intel/thermal_interrupt.h"
 
 #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
 
@@ -219,6 +220,7 @@ struct global_params {
  * @sched_flags:    Store scheduler flags for possible cross CPU update
  * @hwp_boost_min:  Last HWP boosted min performance
  * @suspended:      Whether or not the driver has been suspended.
+ * @hwp_notify_work: workqueue for HWP notifications.
  *
  * This structure stores per CPU instance data for all CPUs.
  */
@@ -257,6 +259,7 @@ struct cpudata {
     unsigned int sched_flags;
     u32 hwp_boost_min;
     bool suspended;
+    struct delayed_work hwp_notify_work;
 };
 
 static struct cpudata **all_cpu_data;
@@ -1625,6 +1628,40 @@ static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
 
 /************************** sysfs end ************************/
 
+static void intel_pstate_notify_work(struct work_struct *work)
+{
+    mutex_lock(&intel_pstate_driver_lock);
+    cpufreq_update_policy(smp_processor_id());
+    wrmsrl(MSR_HWP_STATUS, 0);
+    mutex_unlock(&intel_pstate_driver_lock);
+}
+
+void notify_hwp_interrupt(void)
+{
+    unsigned int this_cpu = smp_processor_id();
+    struct cpudata *cpudata;
+    u64 value;
+
+    if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
+        return;
+
+    rdmsrl(MSR_HWP_STATUS, value);
+    if (!(value & 0x01))
+        return;
+
+    cpudata = all_cpu_data[this_cpu];
+    schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work, msecs_to_jiffies(10));
+}
+
+static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
+{
+    /* Enable HWP notification interrupt for guaranteed performance change */
+    if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
+        INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work);
+        wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
+    }
+}
+
 static void intel_pstate_hwp_enable(struct cpudata *cpudata)
 {
     /* First disable HWP notification interrupt as we don't process them */
@@ -1634,6 +1671,8 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
     wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
     if (cpudata->epp_default == -EINVAL)
         cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
+
+    intel_pstate_enable_hwp_interrupt(cpudata);
 }
 
 static int atom_get_min_pstate(void)
@@ -2969,7 +3008,7 @@ static void intel_pstate_driver_cleanup(void)
 {
     unsigned int cpu;
 
-    get_online_cpus();
+    cpus_read_lock();
     for_each_online_cpu(cpu) {
         if (all_cpu_data[cpu]) {
             if (intel_pstate_driver == &intel_pstate)
@@ -2979,7 +3018,7 @@ static void intel_pstate_driver_cleanup(void)
             all_cpu_data[cpu] = NULL;
         }
     }
-    put_online_cpus();
+    cpus_read_unlock();
 
     intel_pstate_driver = NULL;
 }
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index b9ccb6a3dad9..12ab4014af71 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -1180,7 +1180,7 @@ static int powernowk8_init(void)
     if (!x86_match_cpu(powernow_k8_ids))
         return -ENODEV;
 
-    get_online_cpus();
+    cpus_read_lock();
     for_each_online_cpu(i) {
         smp_call_function_single(i, check_supported_cpu, &ret, 1);
         if (!ret)
@@ -1188,10 +1188,10 @@ static int powernowk8_init(void)
     }
 
     if (supported_cpus != num_online_cpus()) {
-        put_online_cpus();
+        cpus_read_unlock();
         return -ENODEV;
     }
-    put_online_cpus();
+    cpus_read_unlock();
 
     ret = cpufreq_register_driver(&cpufreq_amd64_driver);
     if (ret)
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 005600cef273..23a06cba392c 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -918,7 +918,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work)
     unsigned int cpu;
     cpumask_t mask;
 
-    get_online_cpus();
+    cpus_read_lock();
     cpumask_and(&mask, &chip->mask, cpu_online_mask);
     smp_call_function_any(&mask,
                   powernv_cpufreq_throttle_check, NULL, 0);
@@ -939,7 +939,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work)
         cpufreq_cpu_put(policy);
     }
 out:
-    put_online_cpus();
+    cpus_read_unlock();
 }
 
 static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 67f2e0710e79..2a97c6535c4c 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -95,15 +95,7 @@ static struct dev_pm_opp *_find_opp_of_np(struct opp_table *opp_table,
 static struct device_node *of_parse_required_opp(struct device_node *np,
                          int index)
 {
-    struct device_node *required_np;
-
-    required_np = of_parse_phandle(np, "required-opps", index);
-    if (unlikely(!required_np)) {
-        pr_err("%s: Unable to parse required-opps: %pOF, index: %d\n",
-               __func__, np, index);
-    }
-
-    return required_np;
+    return of_parse_phandle(np, "required-opps", index);
 }
 
 /* The caller must call dev_pm_opp_put_opp_table() after the table is used */
@@ -1328,7 +1320,7 @@ int of_get_required_opp_performance_state(struct device_node *np, int index)
 
     required_np = of_parse_required_opp(np, index);
     if (!required_np)
-        return -EINVAL;
+        return -ENODEV;
 
     opp_table = _find_table_of_opp_np(required_np);
     if (IS_ERR(opp_table)) {
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index aacf575c15cf..a5e6759c407b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1906,11 +1906,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
      * so that things like MSI message writing will behave as expected
      * (e.g. if the device really is in D0 at enable time).
      */
-    if (dev->pm_cap) {
-        u16 pmcsr;
-
-        pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-        dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
-    }
+    pci_update_current_state(dev, dev->current_state);
 
     if (atomic_inc_return(&dev->enable_cnt) > 1)
         return 0;       /* already enabled */
@@ -2495,7 +2491,14 @@ static int __pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable)
     if (enable) {
         int error;
 
-        if (pci_pme_capable(dev, state))
+        /*
+         * Enable PME signaling if the device can signal PME from
+         * D3cold regardless of whether or not it can signal PME from
+         * the current target state, because that will allow it to
+         * signal PME when the hierarchy above it goes into D3cold and
+         * the device itself ends up in D3cold as a result of that.
+         */
+        if (pci_pme_capable(dev, state) || pci_pme_capable(dev, PCI_D3cold))
             pci_pme_active(dev, true);
         else
             ret = 1;
@@ -2599,16 +2602,20 @@ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup)
     if (dev->current_state == PCI_D3cold)
         target_state = PCI_D3cold;
 
-    if (wakeup) {
+    if (wakeup && dev->pme_support) {
+        pci_power_t state = target_state;
+
         /*
          * Find the deepest state from which the device can generate
          * PME#.
          */
-        if (dev->pme_support) {
-            while (target_state
-                   && !(dev->pme_support & (1 << target_state)))
-                target_state--;
-        }
+        while (state && !(dev->pme_support & (1 << state)))
+            state--;
+
+        if (state)
+            return state;
+        else if (dev->pme_support & 1)
+            return PCI_D0;
     }
 
     return target_state;
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 73cf68af9770..7c0099e7a6d7 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -158,16 +158,16 @@ static int get_energy_counter(struct powercap_zone *power_zone,
     /* prevent CPU hotplug, make sure the RAPL domain does not go
      * away while reading the counter.
      */
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
 
     if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
         *energy_raw = energy_now;
-        put_online_cpus();
+        cpus_read_unlock();
 
         return 0;
     }
-    put_online_cpus();
+    cpus_read_unlock();
 
     return -EIO;
 }
@@ -216,11 +216,11 @@ static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
     if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
         return -EACCES;
 
-    get_online_cpus();
+    cpus_read_lock();
     rapl_write_data_raw(rd, PL1_ENABLE, mode);
     if (rapl_defaults->set_floor_freq)
         rapl_defaults->set_floor_freq(rd, mode);
-    put_online_cpus();
+    cpus_read_unlock();
 
     return 0;
 }
@@ -234,13 +234,13 @@ static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
         *mode = false;
         return 0;
     }
-    get_online_cpus();
+    cpus_read_lock();
     if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
-        put_online_cpus();
+        cpus_read_unlock();
         return -EIO;
     }
     *mode = val;
-    put_online_cpus();
+    cpus_read_unlock();
 
     return 0;
 }
@@ -317,7 +317,7 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
     int ret = 0;
     int id;
 
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
     id = contraint_to_pl(rd, cid);
     if (id < 0) {
@@ -350,7 +350,7 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
     if (!ret)
         package_power_limit_irq_save(rp);
 set_exit:
-    put_online_cpus();
+    cpus_read_unlock();
     return ret;
 }
@@ -363,7 +363,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
     int ret = 0;
     int id;
 
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
     id = contraint_to_pl(rd, cid);
     if (id < 0) {
@@ -382,7 +382,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
         prim = POWER_LIMIT4;
         break;
     default:
-        put_online_cpus();
+        cpus_read_unlock();
         return -EINVAL;
     }
     if (rapl_read_data_raw(rd, prim, true, &val))
@@ -391,7 +391,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
         *data = val;
 
 get_exit:
-    put_online_cpus();
+    cpus_read_unlock();
 
     return ret;
 }
@@ -403,7 +403,7 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
     int ret = 0;
     int id;
 
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
     id = contraint_to_pl(rd, cid);
     if (id < 0) {
@@ -423,7 +423,7 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
     }
 
 set_time_exit:
-    put_online_cpus();
+    cpus_read_unlock();
     return ret;
 }
@@ -435,7 +435,7 @@ static int get_time_window(struct powercap_zone *power_zone, int cid,
     int ret = 0;
     int id;
 
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
     id = contraint_to_pl(rd, cid);
     if (id < 0) {
@@ -458,14 +458,14 @@ static int get_time_window(struct powercap_zone *power_zone, int cid,
         val = 0;
         break;
     default:
-        put_online_cpus();
+        cpus_read_unlock();
         return -EINVAL;
     }
     if (!ret)
         *data = val;
 
 get_time_exit:
-    put_online_cpus();
+    cpus_read_unlock();
 
     return ret;
 }
@@ -491,7 +491,7 @@ static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
     int prim;
     int ret = 0;
 
-    get_online_cpus();
+    cpus_read_lock();
     rd = power_zone_to_rapl_domain(power_zone);
     switch (rd->rpl[id].prim_id) {
     case PL1_ENABLE:
@@ -504,7 +504,7 @@ static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
         prim = MAX_POWER;
         break;
     default:
-        put_online_cpus();
+        cpus_read_unlock();
         return -EINVAL;
     }
     if (rapl_read_data_raw(rd, prim, true, &val))
@@ -516,7 +516,7 @@ static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
     if (rd->rpl[id].prim_id == PL4_ENABLE)
         *data = *data * 2;
 
-    put_online_cpus();
+    cpus_read_unlock();
 
     return ret;
 }
@@ -1358,7 +1358,7 @@ static void power_limit_state_save(void)
     struct rapl_domain *rd;
     int nr_pl, ret, i;
 
-    get_online_cpus();
+    cpus_read_lock();
     list_for_each_entry(rp, &rapl_packages, plist) {
         if (!rp->power_zone)
             continue;
@@ -1390,7 +1390,7 @@ static void power_limit_state_save(void)
             }
         }
     }
-    put_online_cpus();
+    cpus_read_unlock();
 }
 
 static void power_limit_state_restore(void)
@@ -1399,7 +1399,7 @@ static void power_limit_state_restore(void)
     struct rapl_domain *rd;
     int nr_pl, i;
 
-    get_online_cpus();
+    cpus_read_lock();
     list_for_each_entry(rp, &rapl_packages, plist) {
         if (!rp->power_zone)
             continue;
@@ -1425,7 +1425,7 @@ static void power_limit_state_restore(void)
             }
         }
     }
-    put_online_cpus();
+    cpus_read_unlock();
 }
 
 static int rapl_pm_callback(struct notifier_block *nb,
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index cc3b22881bfe..1be45f36ab6c 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -138,6 +138,8 @@ static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
 
 /* List of verified CPUs. */
 static const struct x86_cpu_id pl4_support_ids[] = {
     { X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE_L, X86_FEATURE_ANY },
+    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE, X86_FEATURE_ANY },
+    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE_L, X86_FEATURE_ANY },
     {}
 };
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index 99abdc03c44c..dab7e8fb1059 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -569,13 +569,18 @@ static void notify_thresholds(__u64 msr_val)
         platform_thermal_notify(msr_val);
 }
 
+void __weak notify_hwp_interrupt(void)
+{
+    wrmsrl_safe(MSR_HWP_STATUS, 0);
+}
+
 /* Thermal transition interrupt handler */
 void intel_thermal_interrupt(void)
 {
     __u64 msr_val;
 
     if (static_cpu_has(X86_FEATURE_HWP))
-        wrmsrl_safe(MSR_HWP_STATUS, 0);
+        notify_hwp_interrupt();
 
     rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
diff --git a/drivers/thermal/intel/thermal_interrupt.h b/drivers/thermal/intel/thermal_interrupt.h
index 53f427bb58dc..01e7bed2ffc7 100644
--- a/drivers/thermal/intel/thermal_interrupt.h
+++ b/drivers/thermal/intel/thermal_interrupt.h
@@ -12,4 +12,7 @@ extern int (*platform_thermal_notify)(__u64 msr_val);
  * callback has rate control */
 extern bool (*platform_thermal_package_rate_control)(void);
 
+/* Handle HWP interrupt */
+extern void notify_hwp_interrupt(void);
+
 #endif /* _INTEL_THERMAL_INTERRUPT_H */
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 3f221dbf5f95..1834752c5617 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -53,6 +53,22 @@ struct em_perf_domain {
 #ifdef CONFIG_ENERGY_MODEL
 #define EM_MAX_POWER 0xFFFF
 
+/*
+ * Increase resolution of energy estimation calculations for 64-bit
+ * architectures. The extra resolution improves decision made by EAS for the
+ * task placement when two Performance Domains might provide similar energy
+ * estimation values (w/o better resolution the values could be equal).
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+ * are pretty high and the returns do not justify the increased costs.
+ */
+#ifdef CONFIG_64BIT
+#define em_scale_power(p) ((p) * 1000)
+#else
+#define em_scale_power(p) (p)
+#endif
+
 struct em_data_callback {
     /**
      * active_power() - Provide power at the next performance state of
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 2fb373a5c1ed..87069b8459af 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -168,8 +168,6 @@ extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
 extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
         unsigned long val, void *v);
 
-extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
-        unsigned long val_up, unsigned long val_down, void *v);
 extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
         unsigned long val_up, unsigned long val_down, void *v);
 extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 21a0577305ef..67017c9390c8 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -198,6 +198,7 @@ struct generic_pm_domain_data {
     struct notifier_block *power_nb;
     int cpu;
     unsigned int performance_state;
+    unsigned int default_pstate;
     unsigned int rpm_pstate;
     ktime_t next_wakeup;
     void *data;
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index f7e1d0eccdbc..246efc74e3f3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -13,19 +13,32 @@
 #include <linux/spinlock.h>
 #include <linux/syscore_ops.h>
 
-static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+    struct raw_notifier_head chain;
+    raw_spinlock_t lock;
+} cpu_pm_notifier = {
+    .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+    .lock  = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
 
 static int cpu_pm_notify(enum cpu_pm_event event)
 {
     int ret;
 
     /*
-     * atomic_notifier_call_chain has a RCU read critical section, which
-     * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
-     * RCU know this.
+     * This introduces a RCU read critical section, which could be
+     * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
+     * this.
      */
     rcu_irq_enter_irqson();
-    ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
+    rcu_read_lock();
+    ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+    rcu_read_unlock();
     rcu_irq_exit_irqson();
 
     return notifier_to_errno(ret);
@@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event)
 
 static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
 {
+    unsigned long flags;
     int ret;
 
     rcu_irq_enter_irqson();
-    ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
+    raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+    ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+    raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
     rcu_irq_exit_irqson();
 
     return notifier_to_errno(ret);
@@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
  * Add a driver to a list of drivers that are notified about
  * CPU and CPU cluster low power entry and exit.
  *
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
  */
 int cpu_pm_register_notifier(struct notifier_block *nb)
 {
-    return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+    unsigned long flags;
+    int ret;
+
+    raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+    ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+    raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+    return ret;
 }
 EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
  *
  * Remove a driver from the CPU PM notifier list.
  *
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
  */
 int cpu_pm_unregister_notifier(struct notifier_block *nb)
 {
-    return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+    unsigned long flags;
+    int ret;
+
+    raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+    ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+    raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
+    return ret;
 }
 EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 1b019cbca594..b8251dc0bc0f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
 }
 EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
 
-int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
-        unsigned long val_up, unsigned long val_down, void *v)
-{
-    unsigned long flags;
-    int ret;
-
-    /*
-     * Musn't use RCU; because then the notifier list can
-     * change between the up and down traversal.
-     */
-    spin_lock_irqsave(&nh->lock, flags);
-    ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
-    spin_unlock_irqrestore(&nh->lock, flags);
-
-    return ret;
-}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
-
 /**
  * atomic_notifier_call_chain - Call functions in an atomic notifier chain
  * @nh: Pointer to head of the atomic notifier chain
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0f4530b3a8cd..a332ccd829e2 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
     /* Compute the cost of each performance state. */
     fmax = (u64) table[nr_states - 1].frequency;
     for (i = 0; i < nr_states; i++) {
-        table[i].cost = div64_u64(fmax * table[i].power,
+        unsigned long power_res = em_scale_power(table[i].power);
+
+        table[i].cost = div64_u64(fmax * power_res,
                       table[i].frequency);
     }
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 12c7e1bb442f..44169f3081fd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {}
 
 struct kobject *power_kobj;
 
-/**
+/*
  * state - control system sleep states.
  *
  * show() returns available sleep state labels, which may be "mem", "standby",
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d8cae434f9eb..eb75f394a059 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -96,7 +96,7 @@ static void s2idle_enter(void)
     s2idle_state = S2IDLE_STATE_ENTER;
     raw_spin_unlock_irq(&s2idle_lock);
 
-    get_online_cpus();
+    cpus_read_lock();
     cpuidle_resume();
 
     /* Push all the CPUs into the idle loop. */
@@ -106,7 +106,7 @@ static void s2idle_enter(void)
                s2idle_state == S2IDLE_STATE_WAKE);
 
     cpuidle_pause();
-    put_online_cpus();
+    cpus_read_unlock();
 
     raw_spin_lock_irq(&s2idle_lock);
 
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index e1ed58adb69e..d20526c5be15 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
 {
     struct rtc_device *candidate = to_rtc_device(dev);
 
-    if (!candidate->ops->set_alarm)
+    if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
         return 0;
     if (!device_may_wakeup(candidate->dev.parent))
         return 0;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 57124614363d..e7af18857371 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = {
 };
 ATTRIBUTE_GROUPS(sugov);
 
+static void sugov_tunables_free(struct kobject *kobj)
+{
+    struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+
+    kfree(to_sugov_tunables(attr_set));
+}
+
 static struct kobj_type sugov_tunables_ktype = {
     .default_groups = sugov_groups,
     .sysfs_ops = &governor_sysfs_ops,
+    .release = &sugov_tunables_free,
 };
 
 /********************** cpufreq governor interface *********************/
@@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
     return tunables;
 }
 
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
 {
     if (!have_governor_per_policy())
         global_tunables = NULL;
-
-    kfree(tunables);
 }
 
 static int sugov_init(struct cpufreq_policy *policy)
@@ -707,7 +713,7 @@ out:
 fail:
     kobject_put(&tunables->attr_set.kobj);
     policy->governor_data = NULL;
-    sugov_tunables_free(tunables);
+    sugov_clear_global_tunables();
 
 stop_kthread:
     sugov_kthread_stop(sg_policy);
@@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
     count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
     policy->governor_data = NULL;
     if (!count)
-        sugov_tunables_free(tunables);
+        sugov_clear_global_tunables();
 
     mutex_unlock(&global_tunables_lock);
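
The energy-model change deserves a worked example. em_create_perf_table() computes each state's cost with integer division, cost = fmax * power / freq, and on 64-bit em_scale_power() now multiplies power by 1000 first, so two states whose costs would otherwise truncate to the same integer stay distinguishable to EAS. A standalone illustration with invented numbers:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t fmax = 300000;             /* kHz, invented */
    uint64_t p1 = 100, f1 = 299900;     /* state A: mW, kHz */
    uint64_t p2 = 100, f2 = 300000;     /* state B */

    /* Unscaled, both costs truncate to 100. */
    printf("unscaled: %" PRIu64 " vs %" PRIu64 "\n",
           fmax * p1 / f1, fmax * p2 / f2);

    /* Scaled by 1000, they come out 100033 vs 100000. */
    printf("scaled:   %" PRIu64 " vs %" PRIu64 "\n",
           fmax * p1 * 1000 / f1, fmax * p2 * 1000 / f2);

    return 0;
}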