From bfcc1e67ff1e4aa8bfe2ca57f99390fc284c799d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 14 Sep 2021 19:23:28 -0700 Subject: PM: sleep: Do not assume that "mem" is always present An implementation of suspend_ops is allowed to reject the PM_SUSPEND_MEM suspend type from its ->valid() callback, so we should not assume that it is always present, as doing so does not correctly reflect what a firmware interface may support. Fixes: 406e79385f32 ("PM / sleep: System sleep state selection interface rework") Signed-off-by: Florian Fainelli Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index eb75f394a059..02e306ad8db8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -171,8 +171,7 @@ static bool valid_state(suspend_state_t state) void __init pm_states_init(void) { - /* "mem" and "freeze" are always present in /sys/power/state. */ - pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + /* "freeze" is always present in /sys/power/state. */ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; /* * Suspend-to-idle should be supported even without any suspend_ops, @@ -214,6 +213,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3-70-g09d2 From 5416da01ff6e7275f9a4cfd7ff99e6b12b8dc2a8 Mon Sep 17 00:00:00 2001 From: Falla Coulibaly Date: Wed, 18 Aug 2021 16:47:40 -0500 Subject: PM: hibernate: Remove blk_status_to_errno in hib_wait_io blk_status_to_errno doesn't appear to perform extra work besides converting blk_status_t to integer. 
This patch removes that unnecessary conversion as the return type of the function is blk_status_t. Signed-off-by: Falla Coulibaly Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3cb89baebc79..9ec418955556 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -306,7 +306,7 @@ static blk_status_t hib_wait_io(struct hib_bio_batch *hb) * a plug will flush the plug list before sleeping. */ wait_event(hb->wait, atomic_read(&hb->count) == 0); - return blk_status_to_errno(hb->error); + return hb->error; } /* -- cgit v1.2.3-70-g09d2 From aa1a43262ad5df010768f69530fa179ff81651d3 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:22 +0100 Subject: PM: EM: Fix inefficient states detection Currently, a debug message is printed if an inefficient state is detected in the Energy Model. Unfortunately, it won't detect if the first state is inefficient or if two successive states are. Fix this behavior. Fixes: 27871f7a8a34 (PM: Introduce an Energy Model management framework) Signed-off-by: Vincent Donnefort Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Reviewed-by: Matthias Kaehlcke Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/power/energy_model.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index a332ccd829e2..97e62469a6b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,8 +107,7 @@ static void em_debug_remove_pd(struct device *dev) {} static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, int nr_states, struct em_data_callback *cb) { - unsigned long opp_eff, prev_opp_eff = ULONG_MAX; - unsigned long power, freq, prev_freq = 0; + unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; int i, ret; u64 fmax; @@ -153,27 +152,21 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].power = power; table[i].frequency = prev_freq = freq; - - /* - * The hertz/watts efficiency ratio should decrease as the - * frequency grows on sane platforms. But this isn't always - * true in practice so warn the user if a higher OPP is more - * power efficient than a lower one. - */ - opp_eff = freq / power; - if (opp_eff >= prev_opp_eff) - dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", - i, i - 1); - prev_opp_eff = opp_eff; } /* Compute the cost of each performance state. 
*/ fmax = (u64) table[nr_states - 1].frequency; - for (i = 0; i < nr_states; i++) { + for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res = em_scale_power(table[i].power); table[i].cost = div64_u64(fmax * power_res, table[i].frequency); + if (table[i].cost >= prev_cost) { + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } } pd->table = table; -- cgit v1.2.3-70-g09d2 From c8ed99533dbc0fcc1142671ec80acb33045d2999 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:23 +0100 Subject: PM: EM: Mark inefficient states Some SoCs, such as the sd855 have OPPs within the same performance domain, whose cost is higher than others with a higher frequency. Even though those OPPs are interesting from a cooling perspective, it makes no sense to use them when the device can run at full capacity. Those OPPs handicap the performance domain, when choosing the most energy-efficient CPU and are wasting energy. They are inefficient. Hence, add support for such OPPs to the Energy Model. The table can now be read skipping inefficient performance states (and by extension, inefficient OPPs). Signed-off-by: Vincent Donnefort Reviewed-by: Matthias Kaehlcke Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 39dcadd492b5..3641ca4acf04 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -17,13 +17,25 @@ * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency + * @flags: see "em_perf_state flags" description below. 
*/ struct em_perf_state { unsigned long frequency; unsigned long power; unsigned long cost; + unsigned long flags; }; +/* + * em_perf_state flags: + * + * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is + * in this em_perf_domain, another performance state with a higher frequency + * but a lower or equal power cost. Such inefficient states are ignored when + * using em_pd_get_efficient_*() functions. + */ +#define EM_PERF_STATE_INEFFICIENT BIT(0) + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 97e62469a6b3..6d8438347535 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -2,7 +2,7 @@ /* * Energy Model of devices * - * Copyright (c) 2018-2020, Arm ltd. + * Copyright (c) 2018-2021, Arm ltd. * Written by: Quentin Perret, Arm ltd. * Improvements provided by: Lukasz Luba, Arm ltd. */ @@ -42,6 +42,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) debugfs_create_ulong("frequency", 0444, d, &ps->frequency); debugfs_create_ulong("power", 0444, d, &ps->power); debugfs_create_ulong("cost", 0444, d, &ps->cost); + debugfs_create_ulong("inefficient", 0444, d, &ps->flags); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -162,6 +163,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].cost = div64_u64(fmax * power_res, table[i].frequency); if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", table[i].frequency); } else { -- cgit v1.2.3-70-g09d2 From 88f7a89560f6d0fc7803a8933637488f14e0a098 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:24 +0100 Subject: PM: EM: Extend em_perf_domain with a flag field Merge the current "milliwatts" option into a "flag" field. 
This prepares the structure to be extended with support for inefficient states in the Energy Model. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 13 ++++++++++--- kernel/power/energy_model.c | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel/power') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3641ca4acf04..671440371a95 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -40,8 +40,7 @@ struct em_perf_state { * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @milliwatts: Flag indicating the power values are in milli-Watts - * or some other scale. + * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -56,10 +55,18 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; - int milliwatts; + unsigned long flags; unsigned long cpus[]; }; +/* + * em_perf_domain flags: + * + * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some + * other scale. + */ +#define EM_PERF_DOMAIN_MILLIWATTS BIT(0) + #define em_span_cpus(em) (to_cpumask((em)->cpus)) #ifdef CONFIG_ENERGY_MODEL diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6d8438347535..3a7d1573b214 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -56,7 +56,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); static int em_debug_units_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? 
+ "milliWatts" : "bogoWatts"; seq_printf(s, "%s\n", units); @@ -330,7 +331,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; - dev->em_pd->milliwatts = milliwatts; + if (milliwatts) + dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit v1.2.3-70-g09d2 From 8354eb9eb3ddb4a8d0857648a470beffcc9d8639 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:25 +0100 Subject: PM: EM: Allow skipping inefficient states The new performance domain flag EM_PERF_DOMAIN_SKIP_INEFFICIENCIES allows inefficient states to be ignored when estimating energy consumption. It is intended to let the Energy Model know that CPUFreq itself will skip inefficiencies, so such states no longer need to be part of the estimation. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 43 +++++++++++++++++++++++++++++++++++++------ kernel/power/energy_model.c | 13 +++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) (limited to 'kernel/power') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 671440371a95..6377adc3b78d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -64,8 +64,12 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some * other scale. + * + * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating + * energy consumption. 
*/ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) +#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) #define em_span_cpus(em) (to_cpumask((em)->cpus)) @@ -120,6 +124,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); +/** + * em_pd_get_efficient_state() - Get an efficient performance state from the EM + * @pd : Performance domain for which we want an efficient frequency + * @freq : Frequency to map with the EM + * + * It is called from the scheduler code quite frequently and as a consequence + * doesn't implement any check. + * + * Return: An efficient performance state, high enough to meet @freq + * requirement. + */ +static inline +struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, + unsigned long freq) +{ + struct em_perf_state *ps; + int i; + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &pd->table[i]; + if (ps->frequency >= freq) { + if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + ps->flags & EM_PERF_STATE_INEFFICIENT) + continue; + break; + } + } + + return ps; +} + /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a * performance domain @@ -142,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, scale_cpu; struct em_perf_state *ps; - int i, cpu; + int cpu; if (!sum_util) return 0; @@ -167,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. 
*/ - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; - if (ps->frequency >= freq) - break; - } + ps = em_pd_get_efficient_state(pd, freq); /* * The capacity of a CPU in the domain at the performance state (ps) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3a7d1573b214..d353ef29e37f 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -65,6 +65,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_units); +static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; + + seq_printf(s, "%d\n", enabled); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -78,6 +89,8 @@ static void em_debug_create_pd(struct device *dev) &em_debug_cpus_fops); debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, + &em_debug_skip_inefficiencies_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) -- cgit v1.2.3-70-g09d2 From e458716a92b57f854deb89bb40aa3554c2b6205e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:30 +0100 Subject: PM: EM: Mark inefficiencies in CPUFreq The Energy Model has a 1:1 mapping between OPPs and performance states (em_perf_state). If a CPUFreq driver registers an Energy Model, inefficiencies found by the latter can be applied to CPUFreq. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/power/energy_model.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel/power') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d353ef29e37f..0153b0ca7b23 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "energy_model: " fmt #include +#include #include #include #include @@ -231,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states, return 0; } +static void em_cpufreq_update_efficiencies(struct device *dev) +{ + struct em_perf_domain *pd = dev->em_pd; + struct em_perf_state *table; + struct cpufreq_policy *policy; + int found = 0; + int i; + + if (!_is_cpu_device(dev) || !pd) + return; + + policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed"); + return; + } + + table = pd->table; + + for (i = 0; i < pd->nr_perf_states; i++) { + if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) + continue; + + if (!cpufreq_table_set_inefficient(policy, table[i].frequency)) + found++; + } + + if (!found) + return; + + /* + * Efficiencies have been installed in CPUFreq, inefficient frequencies + * will be skipped. The EM can do the same. + */ + pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES; +} + /** * em_pd_get() - Return the performance domain for a device * @dev : Device to find the performance domain for @@ -347,6 +385,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (milliwatts) dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + em_cpufreq_update_efficiencies(dev); + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit v1.2.3-70-g09d2 From c1bfc598181bf04b371131df9ea77162079d710e Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 19 Oct 2021 20:55:04 +0200 Subject: Revert "PM: sleep: Do not assume that "mem" is always present" Revert commit bfcc1e67ff1e ("PM: sleep: Do not assume that "mem" is always present"), because it breaks compatibility with user space utilities assuming that "mem" will always be present in /sys/power/state. Fixes: bfcc1e67ff1e ("PM: sleep: Do not assume that "mem" is always present") Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 02e306ad8db8..eb75f394a059 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -171,7 +171,8 @@ static bool valid_state(suspend_state_t state) void __init pm_states_init(void) { - /* "freeze" is always present in /sys/power/state. */ + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; /* * Suspend-to-idle should be supported even without any suspend_ops, @@ -213,7 +214,6 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3-70-g09d2 From 01de5fcd8b1ac0ca28d2bb0921226a54fdd62684 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 7 Oct 2021 21:13:37 +0200 Subject: PM: hibernate: fix sparse warnings When building the kernel with sparse enabled 'C=1' the following warnings show up: kernel/power/swap.c:390:29: warning: incorrect type in assignment (different base types) kernel/power/swap.c:390:29: expected int ret kernel/power/swap.c:390:29: got restricted blk_status_t This is because hib_wait_io() returns a 'blk_status_t', which is a bitwise 
u8. Commit 5416da01ff6e ("PM: hibernate: Remove blk_status_to_errno in hib_wait_io") seems to have mixed up the return type. However, commit 4e4cbee93d56 ("block: switch bios to blk_status_t") is what actually broke the behaviour by returning the wrong type. Rework hib_wait_io() so that it returns an 'int' instead of a 'blk_status_t', and make sure it calls blk_status_to_errno(hb->error) so that an int is returned to its callers. Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t") Fixes: 5416da01ff6e ("PM: hibernate: Remove blk_status_to_errno in hib_wait_io") Signed-off-by: Anders Roxell Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9ec418955556..47107f9cd14c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -299,14 +299,14 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static blk_status_t hib_wait_io(struct hib_bio_batch *hb) +static int hib_wait_io(struct hib_bio_batch *hb) { /* * We are relying on the behavior of blk_plug that a thread with * a plug will flush the plug list before sleeping. */ wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* -- cgit v1.2.3-70-g09d2 From 9437e393777e6d6c30807c2e9abe27b14703e7f4 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Mon, 18 Oct 2021 21:16:21 +0800 Subject: PM: hibernate: swap: Use vzalloc() and kzalloc() Replace vmalloc()/memset() with vzalloc() and kmalloc()/memset() with kzalloc() to simplify the code. Signed-off-by: Cai Huoqing Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 47107f9cd14c..d59420b8d5ff 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -705,22 +705,19 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } - data = vmalloc(array_size(nr_threads, sizeof(*data))); + data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct cmp_data, go)); - crc = kmalloc(sizeof(*crc), GFP_KERNEL); + crc = kzalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } - memset(crc, 0, offsetof(struct crc_data, go)); /* * Start the compression threads. @@ -1198,22 +1195,19 @@ static int load_image_lzo(struct swap_map_handle *handle, goto out_clean; } - data = vmalloc(array_size(nr_threads, sizeof(*data))); + data = vzalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct dec_data, go)); - crc = kmalloc(sizeof(*crc), GFP_KERNEL); + crc = kzalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } - memset(crc, 0, offsetof(struct crc_data, go)); clean_pages_on_decompress = true; -- cgit v1.2.3-70-g09d2 From 39fbef4b0f77f9c89c8f014749ca533643a37c9f Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 13 Oct 2021 20:19:14 +0800 Subject: PM: hibernate: Get block device exclusively in swsusp_check() The following kernel crash can be triggered: [ 89.266592] ------------[ cut here ]------------ [ 89.267427] kernel BUG at fs/buffer.c:3020! 
[ 89.268264] invalid opcode: 0000 [#1] SMP KASAN PTI [ 89.269116] CPU: 7 PID: 1750 Comm: kmmpd-loop0 Not tainted 5.10.0-862.14.0.6.x86_64-08610-gc932cda3cef4-dirty #20 [ 89.273169] RIP: 0010:submit_bh_wbc.isra.0+0x538/0x6d0 [ 89.277157] RSP: 0018:ffff888105ddfd08 EFLAGS: 00010246 [ 89.278093] RAX: 0000000000000005 RBX: ffff888124231498 RCX: ffffffffb2772612 [ 89.279332] RDX: 1ffff11024846293 RSI: 0000000000000008 RDI: ffff888124231498 [ 89.280591] RBP: ffff8881248cc000 R08: 0000000000000001 R09: ffffed1024846294 [ 89.281851] R10: ffff88812423149f R11: ffffed1024846293 R12: 0000000000003800 [ 89.283095] R13: 0000000000000001 R14: 0000000000000000 R15: ffff8881161f7000 [ 89.284342] FS: 0000000000000000(0000) GS:ffff88839b5c0000(0000) knlGS:0000000000000000 [ 89.285711] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 89.286701] CR2: 00007f166ebc01a0 CR3: 0000000435c0e000 CR4: 00000000000006e0 [ 89.287919] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 89.289138] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 89.290368] Call Trace: [ 89.290842] write_mmp_block+0x2ca/0x510 [ 89.292218] kmmpd+0x433/0x9a0 [ 89.294902] kthread+0x2dd/0x3e0 [ 89.296268] ret_from_fork+0x22/0x30 [ 89.296906] Modules linked in: by running the following commands: 1. mkfs.ext4 -O mmp /dev/sda -b 1024 2. mount /dev/sda /home/test 3. echo "/dev/sda" > /sys/power/resume That happens because swsusp_check() calls set_blocksize() on the target partition which confuses the file system: Thread1 Thread2 mount /dev/sda /home/test get s_mmp_bh --> has mapped flag start kmmpd thread echo "/dev/sda" > /sys/power/resume resume_store software_resume swsusp_check set_blocksize truncate_inode_pages_range truncate_cleanup_page block_invalidatepage discard_buffer --> clean mapped flag write_mmp_block submit_bh submit_bh_wbc BUG_ON(!buffer_mapped(bh)) To address this issue, modify swsusp_check() to open the target block device with exclusive access. 
Signed-off-by: Ye Bin [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d59420b8d5ff..ff326c2cb77b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1515,9 +1515,10 @@ end: int swsusp_check(void) { int error; + void *holder; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); + FMODE_READ | FMODE_EXCL, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1539,7 +1540,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); else pr_debug("Image signature found, resuming\n"); } else { -- cgit v1.2.3-70-g09d2 From 8d89835b0467b7e618c1c93603c1aff85a0c3c66 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Oct 2021 18:04:02 +0200 Subject: PM: suspend: Do not pause cpuidle in the suspend-to-idle path It is pointless to pause cpuidle in the suspend-to-idle path, because it is going to be resumed in the same path later and pausing it does not serve any particular purpose in that case. Rework the code to avoid doing that. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Tested-by: Ulf Hansson --- drivers/base/power/main.c | 11 ++++++----- kernel/power/suspend.c | 8 ++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/power') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index fca6eab871fc..41b2afa3aacc 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -747,8 +747,6 @@ void dpm_resume_noirq(pm_message_t state) resume_device_irqs(); device_wakeup_disarm_wake_irqs(); - - cpuidle_resume(); } /** @@ -881,6 +879,7 @@ void dpm_resume_early(pm_message_t state) void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); + cpuidle_resume(); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); @@ -1337,8 +1336,6 @@ int dpm_suspend_noirq(pm_message_t state) { int ret; - cpuidle_pause(); - device_wakeup_arm_wake_irqs(); suspend_device_irqs(); @@ -1522,9 +1519,13 @@ int dpm_suspend_end(pm_message_t state) if (error) goto out; + cpuidle_pause(); + error = dpm_suspend_noirq(state); - if (error) + if (error) { + cpuidle_resume(); dpm_resume_early(resume_event(state)); + } out: dpm_show_time(starttime, state, error, "end"); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index eb75f394a059..529d7818513f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -97,7 +97,6 @@ static void s2idle_enter(void) raw_spin_unlock_irq(&s2idle_lock); cpus_read_lock(); - cpuidle_resume(); /* Push all the CPUs into the idle loop. 
*/ wake_up_all_idle_cpus(); @@ -105,7 +104,6 @@ static void s2idle_enter(void) swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); - cpuidle_pause(); cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); @@ -405,6 +403,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; + if (state != PM_SUSPEND_TO_IDLE) + cpuidle_pause(); + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { pr_err("noirq suspend of devices failed\n"); @@ -459,6 +460,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) dpm_resume_noirq(PMSG_RESUME); Platform_early_resume: + if (state != PM_SUSPEND_TO_IDLE) + cpuidle_resume(); + platform_resume_early(state); Devices_early_resume: -- cgit v1.2.3-70-g09d2 From 23f62d7ab25bd1a7dbbb89cfcd429df7735855af Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Oct 2021 18:07:47 +0200 Subject: PM: sleep: Pause cpuidle later and resume it earlier during system transitions Commit 8651f97bd951 ("PM / cpuidle: System resume hang fix with cpuidle") that introduced cpuidle pausing during system suspend did that to work around a platform firmware issue causing systems to hang during resume if CPUs were allowed to enter idle states in the system suspend and resume code paths. However, pausing cpuidle before the last phase of suspending devices is the source of an otherwise arbitrary difference between the suspend-to-idle path and other system suspend variants, so it is cleaner to do that later, before taking secondary CPUs offline (it is still safer to take secondary CPUs offline with cpuidle paused, though). Modify the code accordingly, but in order to avoid code duplication, introduce new wrapper functions, pm_sleep_disable_secondary_cpus() and pm_sleep_enable_secondary_cpus(), to combine cpuidle_pause() and cpuidle_resume(), respectively, with the handling of secondary CPUs during system-wide transitions to sleep states. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Tested-by: Ulf Hansson --- drivers/base/power/main.c | 8 +------- kernel/power/hibernate.c | 12 +++++++----- kernel/power/power.h | 14 ++++++++++++++ kernel/power/suspend.c | 10 ++-------- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel/power') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 41b2afa3aacc..ac4dde8fdb8b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -879,7 +878,6 @@ void dpm_resume_early(pm_message_t state) void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); - cpuidle_resume(); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); @@ -1519,13 +1517,9 @@ int dpm_suspend_end(pm_message_t state) if (error) goto out; - cpuidle_pause(); - error = dpm_suspend_noirq(state); - if (error) { - cpuidle_resume(); + if (error) dpm_resume_early(resume_event(state)); - } out: dpm_show_time(starttime, state, error, "end"); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 559acef3fddb..9ed9b744876c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -300,7 +300,7 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = suspend_disable_secondary_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; @@ -342,7 +342,7 @@ static int create_image(int platform_mode) local_irq_enable(); Enable_cpus: - suspend_enable_secondary_cpus(); + pm_sleep_enable_secondary_cpus(); /* Allow architectures to do nosmt-specific post-resume dances */ if (!in_suspend) @@ -466,6 +466,8 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Cleanup; + cpuidle_pause(); + error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; @@ -509,7 +511,7 @@ static int resume_target_kernel(bool 
platform_mode) local_irq_enable(); Enable_cpus: - suspend_enable_secondary_cpus(); + pm_sleep_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -587,7 +589,7 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = suspend_disable_secondary_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error) goto Enable_cpus; @@ -609,7 +611,7 @@ int hibernation_platform_enter(void) local_irq_enable(); Enable_cpus: - suspend_enable_secondary_cpus(); + pm_sleep_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 778bf431ec02..326f8d032eb5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include struct swsusp_info { struct new_utsname uts; @@ -310,3 +312,15 @@ extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ + +static inline int pm_sleep_disable_secondary_cpus(void) +{ + cpuidle_pause(); + return suspend_disable_secondary_cpus(); +} + +static inline void pm_sleep_enable_secondary_cpus(void) +{ + suspend_enable_secondary_cpus(); + cpuidle_resume(); +} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 529d7818513f..8bea835ef1fa 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -403,9 +403,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; - if (state != PM_SUSPEND_TO_IDLE) - cpuidle_pause(); - error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { pr_err("noirq suspend of devices failed\n"); @@ -423,7 +420,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - error = suspend_disable_secondary_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -453,16 +450,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) 
BUG_ON(irqs_disabled()); Enable_cpus: - suspend_enable_secondary_cpus(); + pm_sleep_enable_secondary_cpus(); Platform_wake: platform_resume_noirq(state); dpm_resume_noirq(PMSG_RESUME); Platform_early_resume: - if (state != PM_SUSPEND_TO_IDLE) - cpuidle_resume(); - platform_resume_early(state); Devices_early_resume: -- cgit v1.2.3-70-g09d2 From 9f6abfcd67aae51374b4e8aa0b11f0ebd0d8562f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Oct 2021 17:53:43 +0200 Subject: PM: suspend: Use valid_state() consistently Make valid_state() check if the ->enter callback is present in suspend_ops (only PM_SUSPEND_TO_IDLE can be valid otherwise) and make sleep_state_supported() call valid_state() consistently to validate the states other than PM_SUSPEND_TO_IDLE. While at it, clean up the comment in valid_state(). No expected functional impact. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8bea835ef1fa..80cc1f0f502b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -160,11 +160,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake); static bool valid_state(suspend_state_t state) { /* - * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level - * support and need to be valid to the low level - * implementation, no valid callback implies that none are valid. + * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level + * support and need to be valid to the low-level implementation. + * + * No ->valid() or ->enter() callback implies that none are valid. 
*/ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) && + suspend_ops->enter; } void __init pm_states_init(void) @@ -236,7 +238,7 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter); + return state == PM_SUSPEND_TO_IDLE || valid_state(state); } static int platform_suspend_prepare(suspend_state_t state) -- cgit v1.2.3-70-g09d2