From 40fc735b78f0c81cea7d1c511cfd83892cb4d679 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:19 +0000 Subject: x86/resctrl: Track the closid with the rmid x86's RMID are independent of the CLOSID. An RMID can be allocated, used and freed without considering the CLOSID. MPAM's equivalent feature is PMG, which is not an independent number, it extends the CLOSID/PARTID space. For MPAM, only PMG-bits worth of 'RMID' can be allocated for a single CLOSID. i.e. if there is 1 bit of PMG space, then each CLOSID can have two monitor groups. To allow resctrl to disambiguate RMID values for different CLOSID, everything in resctrl that keeps an RMID value needs to know the CLOSID too. This will always be ignored on x86. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Xin Hao Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-6-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 66942d7fba7f..bd4ec22b5a96 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,10 @@ #include #include +/* CLOSID, RMID value used by the default control group */ +#define RESCTRL_RESERVED_CLOSID 0 +#define RESCTRL_RESERVED_RMID 0 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, @@ -225,6 +229,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * for this resource and domain. * @r: resource that the counter should be read from. * @d: domain that the counter should be read from. + * @closid: closid that matches the rmid. Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid + * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. @@ -235,20 +242,25 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * 0 on success, or -EIO, -EINVAL etc on error. */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val); + u32 closid, u32 rmid, enum resctrl_event_id eventid, + u64 *val); + /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. * @r: The domain's resource. * @d: The rmid's domain. + * @closid: closid that matches the rmid. Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid only. * @rmid: The rmid whose counter values should be reset. * @eventid: The eventid whose counter values should be reset. * * This can be called from any CPU. */ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid); + u32 closid, u32 rmid, + enum resctrl_event_id eventid); /** * resctrl_arch_reset_rmid_all() - Reset all private state associated with -- cgit v1.2.3-70-g09d2 From 6fde1424f29b151b9dc8c660eecf4d1645facea5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:28 +0000 Subject: x86/resctrl: Allow resctrl_arch_rmid_read() to sleep MPAM's cache occupancy counters can take a little while to settle once the monitor has been configured. The maximum settling time is described to the driver via a firmware table. The value could be large enough that it makes sense to sleep. To avoid exposing this to resctrl, it should be hidden behind MPAM's resctrl_arch_rmid_read(). resctrl_arch_rmid_read() may be called via IPI meaning it is unable to sleep. In this case, it should return an error if it needs to sleep. This will only affect MPAM platforms where the cache occupancy counter isn't available immediately, nohz_full is in use, and there are no housekeeping CPUs in the necessary domain. There are three callers of resctrl_arch_rmid_read(): __mon_event_count() and __check_limbo() are both called from a non-migrateable context. mon_event_read() invokes __mon_event_count() using smp_call_on_cpu(), which adds work to the target CPUs workqueue. rdtgroup_mutex() is held, meaning this cannot race with the resctrl cpuhp callback. __check_limbo() is invoked via schedule_delayed_work_on() also adds work to a per-cpu workqueue. The remaining call is add_rmid_to_limbo() which is called in response to a user-space syscall that frees an RMID. This opportunistically reads the LLC occupancy counter on the current domain to see if the RMID is over the dirty threshold. This has to disable preemption to avoid reading the wrong domain's value. Disabling preemption here prevents resctrl_arch_rmid_read() from sleeping. add_rmid_to_limbo() walks each domain, but only reads the counter on one domain. If the system has more than one domain, the RMID will always be added to the limbo list. If the RMIDs usage was not over the threshold, it will be removed from the list when __check_limbo() runs. Make this the default behaviour. Free RMIDs are always added to the limbo list for each domain. The user visible effect of this is that a clean RMID is not available for re-allocation immediately after 'rmdir()' completes. This behaviour was never portable as it never happened on a machine with multiple domains. Removing this path allows resctrl_arch_rmid_read() to sleep if its called with interrupts unmasked. Document this is the expected behaviour, and add a might_sleep() annotation to catch changes that won't work on arm64. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-15-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/resctrl/monitor.c | 25 +++++-------------------- include/linux/resctrl.h | 23 ++++++++++++++++++++++- 2 files changed, 27 insertions(+), 21 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fd060ef86f38..e8aeff6673ea 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -277,6 +277,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u64 msr_val, chunks; int ret; + resctrl_arch_rmid_read_context_check(); + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; @@ -455,8 +457,6 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu, err; - u64 val = 0; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -464,17 +464,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->closid, - entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } - /* * For the first limbo RMID in the domain, * setup up the limbo worker. @@ -484,15 +474,10 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) { - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; - } else { - list_add_tail(&entry->list, &rmid_free_lru); - } + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } void free_rmid(u32 closid, u32 rmid) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bd4ec22b5a96..8649fc84aac2 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -236,7 +236,12 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. * - * Call from process context on a CPU that belongs to domain @d. + * Some architectures need to sleep when first programming some of the counters. + * (specifically: arm64's MPAM cache occupancy counters can return 'not ready' + * for a short period of time). Call from a non-migrateable process context on + * a CPU that belongs to domain @d. e.g. use smp_call_on_cpu() or + * schedule_work_on(). This function can be called with interrupts masked, + * e.g. using smp_call_function_any(), but may consistently return an error. * * Return: * 0 on success, or -EIO, -EINVAL etc on error. @@ -245,6 +250,22 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val); +/** + * resctrl_arch_rmid_read_context_check() - warn about invalid contexts + * + * When built with CONFIG_DEBUG_ATOMIC_SLEEP generate a warning when + * resctrl_arch_rmid_read() is called with preemption disabled. + * + * The contract with resctrl_arch_rmid_read() is that if interrupts + * are unmasked, it can sleep. This allows NOHZ_FULL systems to use an + * IPI, (and fail if the call needed to sleep), while most of the time + * the work is scheduled, allowing the call to sleep. + */ +static inline void resctrl_arch_rmid_read_context_check(void) +{ + if (!irqs_disabled()) + might_sleep(); +} /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3-70-g09d2 From e557999f80a5ee4ec812f594ab42bb76c3ec4eb2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:29 +0000 Subject: x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read() Depending on the number of monitors available, Arm's MPAM may need to allocate a monitor prior to reading the counter value. Allocating a contended resource may involve sleeping. __check_limbo() and mon_event_count() each make multiple calls to resctrl_arch_rmid_read(), to avoid extra work on contended systems, the allocation should be valid for multiple invocations of resctrl_arch_rmid_read(). The memory or hardware allocated is not specific to a domain. Add arch hooks for this allocation, which need calling before resctrl_arch_rmid_read(). The allocated monitor is passed to resctrl_arch_rmid_read(), then freed again afterwards. The helper can be called on any CPU, and can sleep. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-16-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/resctrl.h | 11 ++++++++++ arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 7 +++++++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 35 ++++++++++++++++++++++++++++--- include/linux/resctrl.h | 5 ++++- 5 files changed, 55 insertions(+), 4 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 1d274dbabc44..29c4cc343787 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -136,6 +136,17 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) return rmid; } +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index e933e1cdb1c9..52fa0e14cb86 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -546,6 +546,11 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->d = d; rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } cpu = cpumask_any_housekeeping(&d->cpu_mask); @@ -559,6 +564,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); + + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 81f5de916db8..e089d1a1a055 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -137,6 +137,7 @@ struct rmid_read { bool first; int err; u64 val; + void *arch_mon_ctx; }; extern bool rdt_alloc_capable; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index e8aeff6673ea..9b503e6ac490 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -269,7 +269,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val) + u64 *val, void *ignored) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -324,9 +324,17 @@ void __check_limbo(struct rdt_domain *d, bool force_free) u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -340,7 +348,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free) entry = __rmid_entry(idx); if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); @@ -353,6 +362,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free) } cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } bool has_busy_rmid(struct rdt_domain *d) @@ -533,7 +544,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, - &tval); + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -722,11 +733,27 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + __mon_event_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + __mon_event_count(closid, rmid, &rr); /* @@ -736,6 +763,8 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, */ if (is_mba_sc(NULL)) mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8649fc84aac2..bf460c912bf5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -235,6 +235,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. + * @arch_mon_ctx: An architecture specific value from + * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies + * the hardware monitor allocated for this read request. * * Some architectures need to sleep when first programming some of the counters. * (specifically: arm64's MPAM cache occupancy counters can return 'not ready' @@ -248,7 +251,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val); + u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts -- cgit v1.2.3-70-g09d2 From 1b3e50ce7f5001f1e0edaf7d6abea43b264db7ee Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:34 +0000 Subject: x86/resctrl: Add CPU online callback for resctrl work The resctrl architecture specific code may need to create a domain when a CPU comes online, it also needs to reset the CPUs PQR_ASSOC register. The resctrl filesystem code needs to update the rdtgroup_default CPU mask when CPUs are brought online. Currently, this is all done in one function, resctrl_online_cpu(). It will need to be split into architecture and filesystem parts before resctrl can be moved to /fs/. Pull the rdtgroup_default update work out as a filesystem specific cpu_online helper. resctrl_online_cpu() is the obvious name for this, which means the version in core.c needs renaming. resctrl_online_cpu() is called by the arch code once it has done the work to add the new CPU to any domains. In future patches, resctrl_online_cpu() will take the rdtgroup_mutex itself. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-21-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++---- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d1dc80a21ea9..4627d447bc3d 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -606,16 +606,16 @@ static void clear_closid_rmid(int cpu) RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); clear_closid_rmid(cpu); + + resctrl_online_cpu(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -967,7 +967,7 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, resctrl_offline_cpu); if (state < 0) return state; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index ed5fc677a99d..38d3b19a3aca 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -4007,6 +4007,14 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) return 0; } +void resctrl_online_cpu(unsigned int cpu) +{ + lockdep_assert_held(&rdtgroup_mutex); + + /* The CPU is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bf460c912bf5..4c4bad3c34e4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -223,6 +223,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_online_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid -- cgit v1.2.3-70-g09d2 From 978fcca954cb52249babbc14e53de53c88dd6433 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:35 +0000 Subject: x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but CPU When a CPU is taken offline resctrl may need to move the overflow or limbo handlers to run on a different CPU. Once the offline callbacks have been split, cqm_setup_limbo_handler() will be called while the CPU that is going offline is still present in the CPU mask. Pass the CPU to exclude to cqm_setup_limbo_handler() and mbm_setup_overflow_handler(). These functions can use a variant of cpumask_any_but() when selecting the CPU. -1 is used to indicate no CPUs need excluding. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-22-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++++-- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +- arch/x86/kernel/cpu/resctrl/internal.h | 33 ++++++++++++++++++------ arch/x86/kernel/cpu/resctrl/monitor.c | 42 ++++++++++++++++++++++++------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++-- include/linux/resctrl.h | 2 ++ 6 files changed, 72 insertions(+), 21 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4627d447bc3d..55322ba629da 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -584,12 +584,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); + /* + * temporary: exclude_cpu=-1 as this CPU has already + * been removed by cpumask_clear_cpu()d + */ + mbm_setup_overflow_handler(d, 0, RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); + cqm_setup_limbo_handler(d, 0, RESCTRL_PICK_ANY_CPU); } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 52fa0e14cb86..20b02d6f02c1 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -552,7 +552,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, return; } - cpu = cpumask_any_housekeeping(&d->cpu_mask); + cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); /* * cpumask_any_housekeeping() prefers housekeeping CPUs, but diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3ee855c37447..c99f26ebe7a6 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -60,19 +60,36 @@ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that * aren't marked nohz_full * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. * - * Returns a CPU in @mask. If there are housekeeping CPUs that don't use - * nohz_full, these are preferred. + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. */ -static inline unsigned int cpumask_any_housekeeping(const struct cpumask *mask) +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) { unsigned int cpu, hk_cpu; - cpu = cpumask_any(mask); - if (!tick_nohz_full_cpu(cpu)) + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) return cpu; + /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + if (hk_cpu < nr_cpu_ids) cpu = hk_cpu; @@ -573,11 +590,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 92d7ba674003..67edd4c440f0 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -481,7 +481,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) * setup up the limbo worker. */ if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); set_bit(idx, d->rmid_busy_llc); entry->busy++; } @@ -784,7 +785,8 @@ void cqm_handle_limbo(struct work_struct *work) __check_limbo(d, false); if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, delay); } @@ -792,15 +794,25 @@ void cqm_handle_limbo(struct work_struct *work) mutex_unlock(&rdtgroup_mutex); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any_housekeeping(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) @@ -838,14 +850,24 @@ void mbm_handle_overflow(struct work_struct *work) * Re-check for housekeeping CPUs. This allows the overflow handler to * move off a nohz_full CPU quickly. */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask); + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; @@ -856,9 +878,11 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) */ if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any_housekeeping(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 38d3b19a3aca..f5688c79d94f 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2678,7 +2678,8 @@ static int rdt_get_tree(struct fs_context *fc) if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -3989,7 +3990,8 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4c4bad3c34e4..ccbbbe5d18d3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -10,6 +10,8 @@ #define RESCTRL_RESERVED_CLOSID 0 #define RESCTRL_RESERVED_RMID 0 +#define RESCTRL_PICK_ANY_CPU -1 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, -- cgit v1.2.3-70-g09d2 From 258c91e84fedc789353a35ad91d827a9111d3cbd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:36 +0000 Subject: x86/resctrl: Add CPU offline callback for resctrl work The resctrl architecture specific code may need to free a domain when a CPU goes offline, it also needs to reset the CPUs PQR_ASSOC register. Amongst other things, the resctrl filesystem code needs to clear this CPU from the cpu_mask of any control and monitor groups. Currently, this is all done in core.c and called from resctrl_offline_cpu(), making the split between architecture and filesystem code unclear. Move the filesystem work to remove the CPU from the control and monitor groups into a filesystem helper called resctrl_offline_cpu(), and rename the one in core.c resctrl_arch_offline_cpu(). Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-23-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/resctrl/core.c | 25 +++++-------------------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 24 ++++++++++++++++++++++++ include/linux/resctrl.h | 1 + 3 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 55322ba629da..4aedefa22f61 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -625,31 +625,15 @@ static int resctrl_arch_online_cpu(unsigned int cpu) return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) -{ - struct rdtgroup *rdtgrp; struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); @@ -971,7 +955,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_arch_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f5688c79d94f..5bd3d8fb3f67 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -4017,6 +4017,30 @@ void resctrl_online_cpu(unsigned int cpu) cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ccbbbe5d18d3..270ff1d5c051 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -226,6 +226,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_online_cpu(unsigned int cpu); +void resctrl_offline_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid -- cgit v1.2.3-70-g09d2 From fb700810d30b9eb333a7bf447012e1158e35c62f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:38 +0000 Subject: x86/resctrl: Separate arch and fs resctrl locks resctrl has one mutex that is taken by the architecture-specific code, and the filesystem parts. The two interact via cpuhp, where the architecture code updates the domain list. Filesystem handlers that walk the domains list should not run concurrently with the cpuhp callback modifying the list. Exposing a lock from the filesystem code means the interface is not cleanly defined, and creates the possibility of cross-architecture lock ordering headaches. The interaction only exists so that certain filesystem paths are serialised against CPU hotplug. The CPU hotplug code already has a mechanism to do this using cpus_read_lock(). MPAM's monitors have an overflow interrupt, so it needs to be possible to walk the domains list in irq context. RCU is ideal for this, but some paths need to be able to sleep to allocate memory. Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp callback, cpus_read_lock() must always be taken first. rdtgroup_schemata_write() already does this. Most of the filesystem code's domain list walkers are currently protected by the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are rdt_bit_usage_show() and the mon_config helpers which take the lock directly. Make the domain list protected by RCU. An architecture-specific lock prevents concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU, but to keep all the filesystem operations the same, this is changed to call cpus_read_lock(). The mon_config helpers send multiple IPIs, take the cpus_read_lock() in these cases. The other filesystem list walkers need to be able to sleep. Add cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't be invoked when file system operations are occurring. Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live() call isn't obvious. Resctrl's domain online/offline calls now need to take the rdtgroup_mutex themselves. [ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ] Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/resctrl/core.c | 44 +++++++++++++++----- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 15 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 8 ++++ arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 3 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 68 ++++++++++++++++++++++++------- include/linux/resctrl.h | 2 +- 6 files changed, 112 insertions(+), 28 deletions(-) (limited to 'include/linux/resctrl.h') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b03a6c658ae5..8a4ef4f5bddc 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include #include #include @@ -25,8 +26,15 @@ #include #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -354,6 +362,15 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) { struct rdt_domain *d; + /* + * Walking r->domains, ensure it can't race with cpuhp. + * Because this is called via IPI by rdt_ctrl_update(), assertions + * about locks this thread holds will lead to false positives. Check + * someone is holding the CPUs lock. + */ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON_ONCE(!lockdep_is_cpus_held()); + list_for_each_entry(d, &r->domains, list) { /* Find the domain that contains this CPU */ if (cpumask_test_cpu(cpu, &d->cpu_mask)) @@ -510,6 +527,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct rdt_domain *d; int err; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -543,11 +562,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail(&d->list, add_pos); + list_add_tail_rcu(&d->list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); domain_free(hw_dom); } } @@ -558,6 +578,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -568,7 +590,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { resctrl_offline_domain(r, d); - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); /* * rdt_domain "d" is going to be freed below, so clear @@ -598,13 +621,13 @@ static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - clear_closid_rmid(cpu); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); resctrl_online_cpu(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -613,13 +636,14 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); resctrl_offline_cpu(cpu); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 20b02d6f02c1..7997b47743a2 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -212,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -316,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; u32 idx; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -381,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -447,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } @@ -467,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) @@ -537,6 +543,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, { int cpu; + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* * Setup the parameters to pass to mon_event_count() to read the data. */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 67edd4c440f0..c34a35ec0f03 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,7 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#include #include #include #include @@ -472,6 +473,9 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) lockdep_assert_held(&rdtgroup_mutex); + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; @@ -778,6 +782,7 @@ void cqm_handle_limbo(struct work_struct *work) unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); d = container_of(work, struct rdt_domain, cqm_limbo.work); @@ -792,6 +797,7 @@ void cqm_handle_limbo(struct work_struct *work) } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } /** @@ -823,6 +829,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -856,6 +863,7 @@ void mbm_handle_overflow(struct work_struct *work) out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } /** diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8056bed033cc..884b88e25141 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) struct rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 777e9f680332..011e17efb1a6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -35,6 +35,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -1014,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { @@ -1074,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1329,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) @@ -1593,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid struct rdt_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->domains, list) { @@ -1609,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1690,6 +1701,9 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) unsigned long dom_id, val; struct rdt_domain *d; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + next: if (!tok || tok[0] == '\0') return 0; @@ -1736,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1745,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1760,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1769,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -2245,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2444,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2461,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2793,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_domain *d; int i; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -3077,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_domain *dom; int ret; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(dom, &r->domains, list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) @@ -3907,13 +3937,13 @@ static void domain_destroy_mon_state(struct rdt_domain *d) void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); if (!r->mon_capable) - return; + goto out_unlock; /* * If resctrl is mounted, remove all the @@ -3938,6 +3968,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) @@ -3973,20 +4006,22 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + goto out_unlock; + } if (!r->mon_capable) - return 0; + goto out_unlock; err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); @@ -4006,15 +4041,18 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; } void resctrl_online_cpu(unsigned int cpu) { - lockdep_assert_held(&rdtgroup_mutex); - + mutex_lock(&rdtgroup_mutex); /* The CPU is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); } static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) @@ -4033,8 +4071,7 @@ void resctrl_offline_cpu(unsigned int cpu) struct rdtgroup *rdtgrp; struct rdt_domain *d; - lockdep_assert_held(&rdtgroup_mutex); - + mutex_lock(&rdtgroup_mutex); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { clear_childcpus(rdtgrp, cpu); @@ -4043,7 +4080,7 @@ void resctrl_offline_cpu(unsigned int cpu) } if (!l3->mon_capable) - return; + goto out_unlock; d = get_domain_from_cpu(cpu, l3); if (d) { @@ -4057,6 +4094,9 @@ void resctrl_offline_cpu(unsigned int cpu) cqm_setup_limbo_handler(d, 0, cpu); } } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 270ff1d5c051..a365f67131ec 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -159,7 +159,7 @@ struct resctrl_schema; * @cache_level: Which cache level defines scope of this resource * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. - * @domains: All domains for this resource + * @domains: RCU list of all domains for this resource * @name: Name to use in "schemata" file. * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. -- cgit v1.2.3-70-g09d2