author     Linus Torvalds <torvalds@linux-foundation.org>  2024-07-18 15:41:45 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-07-18 15:41:45 -0700
commit     1c7d0c3af5cc8adafef6477f9416820fc894ca40 (patch)
tree       449450c8ca1726cefb8197256b61c4de0b7cfddb /arch/s390/kernel
parent     dde1a0e1625c08cf4f958348a83434b2ddecf449 (diff)
parent     df39038cd89525d465c2c8827eb64116873f141a (diff)
Merge tag 's390-6.11-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull s390 updates from Vasily Gorbik:
- Remove restrictions on PAI NNPA and crypto counters, enabling
concurrent per-task and system-wide sampling and counting events
- Switch to GENERIC_CPU_DEVICES by setting up the CPU present mask in
the architecture code and letting the generic code handle CPU
bring-up
- Add support for the diag204 busy indication facility to prevent
undesirable blocking during hypervisor logical CPU utilization
queries, and implement caching of the results (see the retry sketch
after this list)
- Improve the handling of Store Data SCLP events by suppressing an
unnecessary warning, preventing release of a buffer still in use for
I/O during failures, and adding timeout handling for Store Data
requests to address potential firmware issues
- Provide optimized __arch_hweight*() implementations
- Remove the unnecessary CPU KOBJ_CHANGE uevents generated during
topology updates, as they are unused and also not present on other
architectures
- Clean up atomic_ops: optimize __atomic_set() for small values and
__atomic_cmpxchg_bool() for compilers that support the flag output
constraint (a minimal sketch follows this list)
- A couple of cleanups for KVM:
- Move and improve KVM struct definitions for DAT tables from
gaccess.c to a new header
- Pass the asce as parameter to sie64a()
- Make the crdte() and cspg() page table handling wrappers return a
boolean to indicate success, like the other existing "compare and
swap" wrappers
- Add documentation for HWCAP flags
- Switch to obtaining total RAM pages from memblock instead of
totalram_pages() during mm init, to ensure the zero page size is
calculated correctly when defer_init is enabled (sketched below)
- Refactor lowcore access and switch to the get_lowcore() function
instead of the S390_lowcore macro (a simplified sketch follows this
list)
- Cleanups for PG_arch_1 and folio handling in UV and hugetlb code
- Add missing MODULE_DESCRIPTION() macros
- Fix VM_FAULT_HWPOISON handling in do_exception()
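
[ Illustration, not part of the merge: a minimal retry sketch for the
  diag204 busy indication mentioned above. Only diag204() returning
  -EBUSY is taken from the changes below; the retry policy, the delay,
  and the helper name are invented for the example. ]

/* Hypothetical helper: retry a busy diag204 request a few times and let
 * the caller fall back to previously cached results if the hypervisor
 * stays busy. Assumes <linux/delay.h> for msleep().
 */
static int diag204_with_retry(unsigned long subcode, unsigned long size,
			      void *addr)
{
	int retries = 10;
	int rc;

	do {
		rc = diag204(subcode, size, addr);
		if (rc != -EBUSY)
			return rc;
		msleep(10);	/* give the hypervisor time to finish */
	} while (--retries);

	return -EBUSY;	/* caller may serve cached data instead */
}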
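
[ Illustration, not the kernel's __atomic_cmpxchg_bool(): a sketch of
  the flag-output-constraint technique referenced above. The "=@cc"
  output operand (supported by newer GCC on s390) hands the condition
  code of the CS instruction to the compiler directly, so no extra
  extraction sequence is needed; cc == 0 means the swap was performed.
  The function name is hypothetical. ]

/* CS compares *ptr with old and, if equal, stores new at *ptr. */
static inline bool cmpxchg_bool_sketch(int *ptr, int old, int new)
{
	int cc;	/* condition code delivered via the flag output operand */

	asm volatile(
		"	cs	%[old],%[new],%[ptr]"
		: [old] "+d" (old), [ptr] "+Q" (*ptr), "=@cc" (cc)
		: [new] "d" (new)
		: "memory");
	return cc == 0;
}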
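
[ Illustration of the memblock point above: with deferred struct page
  init, totalram_pages() may still be too small while mm init runs,
  whereas memblock already describes all physical memory. The helper
  name is invented and this is not necessarily the exact call the patch
  uses; memblock_phys_mem_size() and PFN_DOWN() are existing kernel
  interfaces. ]

static unsigned long __init total_ram_pages_sketch(void)
{
	/* memblock knows the full amount of physical memory early on */
	return PFN_DOWN(memblock_phys_mem_size());
}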
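
[ Illustration only: a simplified stand-in for get_lowcore(). The old
  S390_lowcore macro dereferenced a struct lowcore at absolute address 0
  (the prefix-mapped lowcore); the real accessor hides that address
  behind inline assembly/alternatives so the compiler never sees a
  null-pointer dereference. This sketch skips that detail. ]

static inline struct lowcore *get_lowcore_sketch(void)
{
	return (struct lowcore *)0UL;	/* prefix-mapped lowcore */
}

Usage then changes from S390_lowcore.int_clock to
get_lowcore()->int_clock, as seen throughout the diff below.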
* tag 's390-6.11-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (54 commits)
s390/mm: Fix VM_FAULT_HWPOISON handling in do_exception()
s390/kvm: Move bitfields for dat tables
s390/entry: Pass the asce as parameter to sie64a()
s390/sthyi: Use cached data when diag is busy
s390/sthyi: Move diag operations
s390/hypfs_diag: Diag204 busy loop
s390/diag: Add busy-indication-facility requirements
s390/diag: Diag204 add busy return errno
s390/diag: Return errno's from diag204
s390/sclp: Diag204 busy indication facility detection
s390/atomic_ops: Make use of flag output constraint
s390/atomic_ops: Improve __atomic_set() for small values
s390/atomic_ops: Use symbolic names
s390/smp: Switch to GENERIC_CPU_DEVICES
s390/hwcaps: Add documentation for HWCAP flags
s390/pgtable: Make crdte() and cspg() return a value
s390/topology: Remove CPU KOBJ_CHANGE uevents
s390/sclp: Add timeout to Store Data requests
s390/sclp: Prevent release of buffer in I/O
s390/sclp: Suppress unnecessary Store Data warning
...
Diffstat (limited to 'arch/s390/kernel')
 arch/s390/kernel/asm-offsets.c     |   1
 arch/s390/kernel/diag.c            |  12
 arch/s390/kernel/dumpstack.c       |   8
 arch/s390/kernel/early.c           |  36
 arch/s390/kernel/entry.S           |   8
 arch/s390/kernel/idle.c            |  11
 arch/s390/kernel/irq.c             |  18
 arch/s390/kernel/machine_kexec.c   |   4
 arch/s390/kernel/nmi.c             |  31
 arch/s390/kernel/perf_cpum_sf.c    |   2
 arch/s390/kernel/perf_pai_crypto.c | 183
 arch/s390/kernel/perf_pai_ext.c    | 146
 arch/s390/kernel/process.c         |   6
 arch/s390/kernel/setup.c           |  24
 arch/s390/kernel/smp.c             |  88
 arch/s390/kernel/sthyi.c           |  95
 arch/s390/kernel/syscall.c         |   4
 arch/s390/kernel/time.c            |  22
 arch/s390/kernel/topology.c        |   8
 arch/s390/kernel/traps.c           |  28
 arch/s390/kernel/uv.c              | 207
 arch/s390/kernel/vtime.c           |  82
22 files changed, 610 insertions, 414 deletions
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f55979f64d49..26bb45d0e6f1 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -63,6 +63,7 @@ int main(void) OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 8dee9aa0ec95..9b65f04c83de 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -185,6 +185,8 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) } EXPORT_SYMBOL(diag14); +#define DIAG204_BUSY_RC 8 + static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) { union register_pair rp = { .even = *subcode, .odd = size }; @@ -215,16 +217,18 @@ int diag204(unsigned long subcode, unsigned long size, void *addr) { if (addr) { if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) - return -1; + return -EINVAL; if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) - return -1; + return -EINVAL; } if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); - if (subcode) - return -1; + if (subcode == DIAG204_BUSY_RC) + return -EBUSY; + else if (subcode) + return -EOPNOTSUPP; return size; } EXPORT_SYMBOL(diag204); diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index d2012635b093..1ecd0580561f 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -61,28 +61,28 @@ static bool in_task_stack(unsigned long sp, struct task_struct *task, static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->async_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->nodat_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_NODAT, stack); } static bool in_mcck_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->mcck_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->restart_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_RESTART, stack); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c666271433fb..467ed4dba817 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -72,7 +72,7 @@ static void __init reset_tod_clock(void) memset(&tod_clock_base, 0, sizeof(tod_clock_base)); tod_clock_base.tod = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; + get_lowcore()->last_update_clock = TOD_UNIX_EPOCH; } /* @@ -99,7 +99,7 @@ static noinline __init void detect_machine_type(void) /* Check current-configuration-level */ if (stsi(NULL, 0, 0, 0) <= 2) { 
- S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; + get_lowcore()->machine_flags |= MACHINE_FLAG_LPAR; return; } /* Get virtual-machine cpu information. */ @@ -108,9 +108,9 @@ static noinline __init void detect_machine_type(void) /* Detect known hypervisors */ if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; + get_lowcore()->machine_flags |= MACHINE_FLAG_KVM; else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) - S390_lowcore.machine_flags |= MACHINE_FLAG_VM; + get_lowcore()->machine_flags |= MACHINE_FLAG_VM; } /* Remove leading, trailing and double whitespace. */ @@ -166,7 +166,7 @@ static __init void setup_topology(void) if (!test_facility(11)) return; - S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; + get_lowcore()->machine_flags |= MACHINE_FLAG_TOPOLOGY; for (max_mnest = 6; max_mnest > 1; max_mnest--) { if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) break; @@ -186,8 +186,8 @@ static noinline __init void setup_lowcore_early(void) psw.addr = (unsigned long)early_pgm_check_handler; psw.mask = PSW_KERNEL_BITS; - S390_lowcore.program_new_psw = psw; - S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; + get_lowcore()->program_new_psw = psw; + get_lowcore()->preempt_count = INIT_PREEMPT_COUNT; } static noinline __init void setup_facility_list(void) @@ -211,43 +211,43 @@ static __init void detect_diag9c(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; + get_lowcore()->machine_flags |= MACHINE_FLAG_DIAG9C; } static __init void detect_machine_facilities(void) { if (test_facility(8)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; + get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT1; system_ctl_set_bit(0, CR0_EDAT_BIT); } if (test_facility(78)) - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; + get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT2; if (test_facility(3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; + get_lowcore()->machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(50) && test_facility(73)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + get_lowcore()->machine_flags |= MACHINE_FLAG_TE; system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); } if (test_facility(51)) - S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; + get_lowcore()->machine_flags |= MACHINE_FLAG_TLB_LC; if (test_facility(129)) system_ctl_set_bit(0, CR0_VECTOR_BIT); if (test_facility(130)) - S390_lowcore.machine_flags |= MACHINE_FLAG_NX; + get_lowcore()->machine_flags |= MACHINE_FLAG_NX; if (test_facility(133)) - S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + get_lowcore()->machine_flags |= MACHINE_FLAG_GS; if (test_facility(139) && (tod_clock_base.tod >> 63)) { /* Enabled signed clock comparator comparisons */ - S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + get_lowcore()->machine_flags |= MACHINE_FLAG_SCC; clock_comparator_max = -1ULL >> 1; system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); } if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + get_lowcore()->machine_flags |= MACHINE_FLAG_PCI_MIO; /* the control bit is set during PCI initialization */ } if (test_facility(194)) - S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; + get_lowcore()->machine_flags |= MACHINE_FLAG_RDP; } static inline void save_vector_registers(void) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 60cf917a7122..454b6b92c7f8 100644 --- a/arch/s390/kernel/entry.S +++ 
b/arch/s390/kernel/entry.S @@ -179,6 +179,7 @@ SYM_FUNC_END(__switch_to_asm) * %r2 pointer to sie control block phys * %r3 pointer to sie control block virt * %r4 guest register save area + * %r5 guest asce */ SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers @@ -186,15 +187,12 @@ SYM_FUNC_START(__sie64a) stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags lmg %r0,%r13,0(%r4) # load guest gprs 0-13 - lg %r14,__LC_GMAP # get gmap pointer - ltgr %r14,%r14 - jz .Lsie_gmap oi __LC_CPU_FLAGS+7,_CIF_SIE - lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce -.Lsie_gmap: + lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now tm __SIE_PROG20+3(%r14),3 # last exit... diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index af9c97c0ad73..39cb8d0ae348 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -24,6 +24,7 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); unsigned long idle_time; u64 cycles_new[8]; int i; @@ -34,13 +35,13 @@ void account_idle_time_irq(void) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; + idle_time = lc->int_clock - idle->clock_idle_enter; - S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; - S390_lowcore.last_update_clock = S390_lowcore.int_clock; + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; - S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; - S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; /* Account time spent with enabled wait psw loaded as idle time. */ WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 9acc6630abd3..1af5a08d72ab 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -100,8 +100,8 @@ static const struct irq_class irqclass_sub_desc[] = { static void do_IRQ(struct pt_regs *regs, int irq) { - if (tod_after_eq(S390_lowcore.int_clock, - S390_lowcore.clock_comparator)) + if (tod_after_eq(get_lowcore()->int_clock, + get_lowcore()->clock_comparator)) /* Serve timer interrupts first. 
*/ clock_comparator_work(); generic_handle_irq(irq); @@ -111,7 +111,7 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; + return ((get_lowcore()->async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) @@ -119,7 +119,7 @@ static void do_irq_async(struct pt_regs *regs, int irq) if (on_async_stack()) { do_IRQ(regs, irq); } else { - call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + call_on_stack(2, get_lowcore()->async_stack, void, do_IRQ, struct pt_regs *, regs, int, irq); } } @@ -153,8 +153,8 @@ void noinstr do_io_irq(struct pt_regs *regs) set_cpu_flag(CIF_NOHZ_DELAY); do { - regs->tpi_info = S390_lowcore.tpi_info; - if (S390_lowcore.tpi_info.adapter_IO) + regs->tpi_info = get_lowcore()->tpi_info; + if (get_lowcore()->tpi_info.adapter_IO) do_irq_async(regs, THIN_INTERRUPT); else do_irq_async(regs, IO_INTERRUPT); @@ -183,9 +183,9 @@ void noinstr do_ext_irq(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - regs->int_code = S390_lowcore.ext_int_code_addr; - regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = S390_lowcore.ext_params2; + regs->int_code = get_lowcore()->ext_int_code_addr; + regs->int_parm = get_lowcore()->ext_params; + regs->int_parm_long = get_lowcore()->ext_params2; from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 3aee98efc374..f4cf65da6d49 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -52,7 +52,7 @@ static void __do_machine_kdump(void *data) purgatory = (purgatory_t)image->start; /* store_status() saved the prefix register to lowcore */ - prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + prefix = (unsigned long)get_lowcore()->prefixreg_save_area; /* Now do the reset */ s390_reset_system(); @@ -91,7 +91,7 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK); if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 230d010bac9b..fbd218b6fc8e 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -117,6 +117,7 @@ static __always_inline char *u64_to_hex(char *dest, u64 val) static notrace void s390_handle_damage(void) { + struct lowcore *lc = get_lowcore(); union ctlreg0 cr0, cr0_new; char message[100]; psw_t psw_save; @@ -125,7 +126,7 @@ static notrace void s390_handle_damage(void) smp_emergency_stop(); diag_amode31_ops.diag308_reset(); ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); - u64_to_hex(ptr, S390_lowcore.mcck_interruption_code); + u64_to_hex(ptr, lc->mcck_interruption_code); /* * Disable low address protection and make machine check new PSW a @@ -135,17 +136,17 @@ static notrace void s390_handle_damage(void) cr0_new = cr0; cr0_new.lap = 0; local_ctl_load(0, &cr0_new.reg); - psw_save = S390_lowcore.mcck_new_psw; - psw_bits(S390_lowcore.mcck_new_psw).io = 0; - psw_bits(S390_lowcore.mcck_new_psw).ext = 0; - psw_bits(S390_lowcore.mcck_new_psw).wait = 1; + psw_save = lc->mcck_new_psw; + psw_bits(lc->mcck_new_psw).io = 0; + psw_bits(lc->mcck_new_psw).ext = 0; + psw_bits(lc->mcck_new_psw).wait = 1; 
sclp_emergency_printk(message); /* * Restore machine check new PSW and control register 0 to original * values. This makes possible system dump analysis easier. */ - S390_lowcore.mcck_new_psw = psw_save; + lc->mcck_new_psw = psw_save; local_ctl_load(0, &cr0.reg); disabled_wait(); while (1); @@ -226,7 +227,7 @@ static bool notrace nmi_registers_valid(union mci mci) /* * Set the clock comparator register to the next expected value. */ - set_clock_comparator(S390_lowcore.clock_comparator); + set_clock_comparator(get_lowcore()->clock_comparator); if (!mci.gr || !mci.fp || !mci.fc) return false; /* @@ -252,7 +253,7 @@ static bool notrace nmi_registers_valid(union mci mci) * check handling must take care of this. The host values are saved by * KVM and are not affected. */ - cr2.reg = S390_lowcore.cregs_save_area[2]; + cr2.reg = get_lowcore()->cregs_save_area[2]; if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST)) return false; if (!mci.ms || !mci.pm || !mci.ia) @@ -278,11 +279,10 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) sie_page = container_of(sie_block, struct sie_page, sie_block); mcck_backup = &sie_page->mcck_info; - mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + mcck_backup->mcic = get_lowcore()->mcck_interruption_code & ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); - mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; - mcck_backup->failing_storage_address - = S390_lowcore.failing_storage_address; + mcck_backup->ext_damage_code = get_lowcore()->external_damage_code; + mcck_backup->failing_storage_address = get_lowcore()->failing_storage_address; } NOKPROBE_SYMBOL(s390_backup_mcck_info); @@ -302,6 +302,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); static unsigned long long last_ipd; + struct lowcore *lc = get_lowcore(); struct mcck_struct *mcck; unsigned long long tmp; irqentry_state_t irq_state; @@ -314,7 +315,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (user_mode(regs)) update_timer_mcck(); inc_irq_stat(NMI_NMI); - mci.val = S390_lowcore.mcck_interruption_code; + mci.val = lc->mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); /* @@ -382,9 +383,9 @@ void notrace s390_do_machine_check(struct pt_regs *regs) } if (mci.ed && mci.ec) { /* External damage */ - if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + if (lc->external_damage_code & (1U << ED_STP_SYNC)) mcck->stp_queue |= stp_sync_check(); - if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + if (lc->external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); mcck_pending = 1; } diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 06efad5b4f93..736c1d9632dd 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1022,7 +1022,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu) } /* Load current program parameter */ - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " "interval %#lx tear %#lx dear %#lx\n", __func__, diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 4ad472d130a3..2f5a20e300f6 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -36,8 +36,8 @@ struct paicrypt_map { struct pai_userdata *save; /* Page to store no-zero counters */ unsigned int active_events; /* # of PAI crypto users */ refcount_t refcnt; /* Reference count mapped 
buffers */ - enum paievt_mode mode; /* Type of event */ struct perf_event *event; /* Perf event for sampling */ + struct list_head syswide_list; /* List system-wide sampling events */ }; struct paicrypt_mapptr { @@ -84,20 +84,16 @@ static DEFINE_MUTEX(pai_reserve_mutex); /* Adjust usage counters and remove allocated memory when all users are * gone. */ -static void paicrypt_event_destroy(struct perf_event *event) +static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu) { - struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, - event->cpu); + struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); struct paicrypt_map *cpump = mp->mapptr; - static_branch_dec(&pai_key); mutex_lock(&pai_reserve_mutex); - debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" - " mode %d refcnt %u\n", __func__, - event->attr.config, event->cpu, - cpump->active_events, cpump->mode, + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d " + "refcnt %u\n", __func__, event->attr.config, + event->cpu, cpump->active_events, refcount_read(&cpump->refcnt)); - free_page(PAI_SAVE_AREA(event)); if (refcount_dec_and_test(&cpump->refcnt)) { debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", __func__, (unsigned long)cpump->page, @@ -111,6 +107,23 @@ static void paicrypt_event_destroy(struct perf_event *event) mutex_unlock(&pai_reserve_mutex); } +static void paicrypt_event_destroy(struct perf_event *event) +{ + int cpu; + + static_branch_dec(&pai_key); + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + paicrypt_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paicrypt_event_destroy_cpu(event, event->cpu); + } +} + static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) { if (kernel) @@ -156,23 +169,15 @@ static u64 paicrypt_getall(struct perf_event *event) return sum; } -/* Used to avoid races in checking concurrent access of counting and - * sampling for crypto events - * - * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is - * allowed and when this event is running, no counting event is allowed. - * Several counting events are allowed in parallel, but no sampling event - * is allowed while one (or more) counting events are running. - * +/* Check concurrent access of counting and sampling for crypto events. * This function is called in process context and it is save to block. * When the event initialization functions fails, no other call back will * be invoked. * * Allocate the memory for the event. */ -static struct paicrypt_map *paicrypt_busy(struct perf_event *event) +static struct paicrypt_map *paicrypt_busy(struct perf_event *event, int cpu) { - struct perf_event_attr *a = &event->attr; struct paicrypt_map *cpump = NULL; struct paicrypt_mapptr *mp; int rc; @@ -185,7 +190,7 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) goto unlock; /* Allocate node for this event */ - mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu); + mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); cpump = mp->mapptr; if (!cpump) { /* Paicrypt_map allocated? */ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); @@ -193,25 +198,9 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) rc = -ENOMEM; goto free_root; } + INIT_LIST_HEAD(&cpump->syswide_list); } - if (a->sample_period) { /* Sampling requested */ - if (cpump->mode != PAI_MODE_NONE) - rc = -EBUSY; /* ... 
sampling/counting active */ - } else { /* Counting requested */ - if (cpump->mode == PAI_MODE_SAMPLING) - rc = -EBUSY; /* ... and sampling active */ - } - /* - * This error case triggers when there is a conflict: - * Either sampling requested and counting already active, or visa - * versa. Therefore the struct paicrypto_map for this CPU is - * needed or the error could not have occurred. Only adjust root - * node refcount. - */ - if (rc) - goto free_root; - /* Allocate memory for counter page and counter extraction. * Only the first counting event has to allocate a page. */ @@ -235,26 +224,58 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) /* Set mode and reference count */ rc = 0; refcount_set(&cpump->refcnt, 1); - cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING; mp->mapptr = cpump; - debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d" - " mode %d refcnt %u page %#lx save %p rc %d\n", - __func__, a->sample_period, cpump->active_events, - cpump->mode, refcount_read(&cpump->refcnt), + debug_sprintf_event(cfm_dbg, 5, "%s users %d refcnt %u page %#lx " + "save %p rc %d\n", __func__, cpump->active_events, + refcount_read(&cpump->refcnt), (unsigned long)cpump->page, cpump->save, rc); goto unlock; free_paicrypt_map: + /* Undo memory allocation */ kfree(cpump); mp->mapptr = NULL; free_root: paicrypt_root_free(); - unlock: mutex_unlock(&pai_reserve_mutex); return rc ? ERR_PTR(rc) : cpump; } +static int paicrypt_event_init_all(struct perf_event *event) +{ + struct paicrypt_map *cpump; + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + cpump = paicrypt_busy(event, cpu); + if (IS_ERR(cpump)) { + for_each_cpu(cpu, maskptr) + paicrypt_event_destroy_cpu(event, cpu); + kfree(maskptr); + rc = PTR_ERR(cpump); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in paicrypt_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + /* Might be called on different CPU than the one the event is intended for. */ static int paicrypt_event_init(struct perf_event *event) { @@ -269,10 +290,7 @@ static int paicrypt_event_init(struct perf_event *event) if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) return -EINVAL; - /* Allow only CPU wide operation, no process context for now. */ - if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) - return -ENOENT; - /* Allow only CRYPTO_ALL for sampling. 
*/ + /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; /* Get a page to store last counter values for sampling */ @@ -284,13 +302,17 @@ static int paicrypt_event_init(struct perf_event *event) } } - cpump = paicrypt_busy(event); - if (IS_ERR(cpump)) { + if (event->cpu >= 0) { + cpump = paicrypt_busy(event, event->cpu); + if (IS_ERR(cpump)) + rc = PTR_ERR(cpump); + } else { + rc = paicrypt_event_init_all(event); + } + if (rc) { free_page(PAI_SAVE_AREA(event)); - rc = PTR_ERR(cpump); goto out; } - event->destroy = paicrypt_event_destroy; if (a->sample_period) { @@ -331,8 +353,14 @@ static void paicrypt_start(struct perf_event *event, int flags) sum = paicrypt_getall(event); /* Get current value */ local64_set(&event->hw.prev_count, sum); } else { /* Sampling */ - cpump->event = event; - perf_sched_cb_inc(event->pmu); + memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } } } @@ -344,7 +372,7 @@ static int paicrypt_add(struct perf_event *event, int flags) if (++cpump->active_events == 1) { ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; - WRITE_ONCE(S390_lowcore.ccd, ccd); + WRITE_ONCE(get_lowcore()->ccd, ccd); local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); } if (flags & PERF_EF_START) @@ -353,6 +381,7 @@ static int paicrypt_add(struct perf_event *event, int flags) return 0; } +static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *); static void paicrypt_stop(struct perf_event *event, int flags) { struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); @@ -361,8 +390,13 @@ static void paicrypt_stop(struct perf_event *event, int flags) if (!event->attr.sample_period) { /* Counting */ paicrypt_read(event); } else { /* Sampling */ - perf_sched_cb_dec(event->pmu); - cpump->event = NULL; + if (!(event->attach_state & PERF_ATTACH_TASK)) { + perf_sched_cb_dec(event->pmu); + list_del(PAI_SWLIST(event)); + } else { + paicrypt_have_sample(event, cpump); + cpump->event = NULL; + } } event->hw.state = PERF_HES_STOPPED; } @@ -375,7 +409,7 @@ static void paicrypt_del(struct perf_event *event, int flags) paicrypt_stop(event, PERF_EF_UPDATE); if (--cpump->active_events == 0) { local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - WRITE_ONCE(S390_lowcore.ccd, 0); + WRITE_ONCE(get_lowcore()->ccd, 0); } } @@ -455,23 +489,30 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, } /* Check if there is data to be saved on schedule out of a task. 
*/ -static int paicrypt_have_sample(void) +static void paicrypt_have_sample(struct perf_event *event, + struct paicrypt_map *cpump) { - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; size_t rawsize; - int rc = 0; if (!event) /* No event active */ - return 0; + return; rawsize = paicrypt_copy(cpump->save, cpump->page, (unsigned long *)PAI_SAVE_AREA(event), - cpump->event->attr.exclude_user, - cpump->event->attr.exclude_kernel); + event->attr.exclude_user, + event->attr.exclude_kernel); if (rawsize) /* No incremented counters */ - rc = paicrypt_push_sample(rawsize, cpump, event); - return rc; + paicrypt_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paicrypt_have_samples(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paicrypt_have_sample(event, cpump); } /* Called on schedule-in and schedule-out. No access to event structure, @@ -480,10 +521,10 @@ static int paicrypt_have_sample(void) static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, clear values. + * results on schedule_out and if page was dirty, save old values. */ if (!sched_in) - paicrypt_have_sample(); + paicrypt_have_samples(); } /* Attribute definitions for paicrypt interface. As with other CPU @@ -527,7 +568,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = { /* Performance monitoring unit for mapped counters */ static struct pmu paicrypt = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = paicrypt_event_init, .add = paicrypt_add, .del = paicrypt_del, diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index a6da7e0cc7a6..6295531b39a2 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -47,11 +47,11 @@ struct paiext_cb { /* PAI extension 1 control block */ struct paiext_map { unsigned long *area; /* Area for CPU to store counters */ struct pai_userdata *save; /* Area to store non-zero counters */ - enum paievt_mode mode; /* Type of event */ unsigned int active_events; /* # of PAI Extension users */ refcount_t refcnt; struct perf_event *event; /* Perf event for sampling */ struct paiext_cb *paiext_cb; /* PAI extension control block area */ + struct list_head syswide_list; /* List system-wide sampling events */ }; struct paiext_mapptr { @@ -70,6 +70,8 @@ static void paiext_root_free(void) free_percpu(paiext_root.mapptr); paiext_root.mapptr = NULL; } + debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paiext_root.refcnt)); } /* On initialization of first event also allocate per CPU data dynamically. 
@@ -115,20 +117,34 @@ static void paiext_free(struct paiext_mapptr *mp) } /* Release the PMU if event is the last perf event */ -static void paiext_event_destroy(struct perf_event *event) +static void paiext_event_destroy_cpu(struct perf_event *event, int cpu) { - struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu); struct paiext_map *cpump = mp->mapptr; - free_page(PAI_SAVE_AREA(event)); mutex_lock(&paiext_reserve_mutex); if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ paiext_free(mp); paiext_root_free(); mutex_unlock(&paiext_reserve_mutex); - debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__, - event->cpu, mp->mapptr); +} + +static void paiext_event_destroy(struct perf_event *event) +{ + int cpu; + + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + for_each_cpu(cpu, mask) + paiext_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paiext_event_destroy_cpu(event, event->cpu); + } + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__, + event->cpu); } /* Used to avoid races in checking concurrent access of counting and @@ -145,19 +161,18 @@ static void paiext_event_destroy(struct perf_event *event) * * Allocate the memory for the event. */ -static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) +static int paiext_alloc_cpu(struct perf_event *event, int cpu) { struct paiext_mapptr *mp; struct paiext_map *cpump; int rc; mutex_lock(&paiext_reserve_mutex); - rc = paiext_root_alloc(); if (rc) goto unlock; - mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + mp = per_cpu_ptr(paiext_root.mapptr, cpu); cpump = mp->mapptr; if (!cpump) { /* Paiext_map allocated? */ rc = -ENOMEM; @@ -185,24 +200,13 @@ static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) paiext_free(mp); goto undo; } + INIT_LIST_HEAD(&cpump->syswide_list); refcount_set(&cpump->refcnt, 1); - cpump->mode = a->sample_period ? PAI_MODE_SAMPLING - : PAI_MODE_COUNTING; + rc = 0; } else { - /* Multiple invocation, check what is active. - * Supported are multiple counter events or only one sampling - * event concurrently at any one time. - */ - if (cpump->mode == PAI_MODE_SAMPLING || - (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) { - rc = -EBUSY; - goto undo; - } refcount_inc(&cpump->refcnt); } - rc = 0; - undo: if (rc) { /* Error in allocation of event, decrement anchor. Since @@ -217,6 +221,38 @@ unlock: return rc; } +static int paiext_alloc(struct perf_event *event) +{ + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + rc = paiext_alloc_cpu(event, cpu); + if (rc) { + for_each_cpu(cpu, maskptr) + paiext_event_destroy_cpu(event, cpu); + kfree(maskptr); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in paicrypt_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + /* The PAI extension 1 control block supports up to 128 entries. Return * the index within PAIE1_CB given the event number. Also validate event * number. 
@@ -246,9 +282,6 @@ static int paiext_event_init(struct perf_event *event) rc = paiext_event_valid(event); if (rc) return rc; - /* Allow only CPU wide operation, no process context for now. */ - if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) - return -ENOENT; /* Allow only event NNPA_ALL for sampling. */ if (a->sample_period && a->config != PAI_NNPA_BASE) return -EINVAL; @@ -262,7 +295,10 @@ static int paiext_event_init(struct perf_event *event) return -ENOMEM; } - rc = paiext_alloc(a, event); + if (event->cpu >= 0) + rc = paiext_alloc_cpu(event, event->cpu); + else + rc = paiext_alloc(event); if (rc) { free_page(PAI_SAVE_AREA(event)); return rc; @@ -334,8 +370,15 @@ static void paiext_start(struct perf_event *event, int flags) sum = paiext_getall(event); /* Get current value */ local64_set(&event->hw.prev_count, sum); } else { /* Sampling */ - cpump->event = event; - perf_sched_cb_inc(event->pmu); + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, + PAIE1_CTRBLOCK_SZ); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } } } @@ -346,12 +389,10 @@ static int paiext_add(struct perf_event *event, int flags) struct paiext_cb *pcb = cpump->paiext_cb; if (++cpump->active_events == 1) { - S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb); + get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb); pcb->acc = virt_to_phys(cpump->area) | 0x1; /* Enable CPU instruction lookup for PAIE1 control block */ local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); - debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", - __func__, S390_lowcore.aicd, pcb->acc); } if (flags & PERF_EF_START) paiext_start(event, PERF_EF_RELOAD); @@ -359,6 +400,7 @@ static int paiext_add(struct perf_event *event, int flags) return 0; } +static void paiext_have_sample(struct perf_event *, struct paiext_map *); static void paiext_stop(struct perf_event *event, int flags) { struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); @@ -367,8 +409,13 @@ static void paiext_stop(struct perf_event *event, int flags) if (!event->attr.sample_period) { /* Counting */ paiext_read(event); } else { /* Sampling */ - perf_sched_cb_dec(event->pmu); - cpump->event = NULL; + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_del(PAI_SWLIST(event)); + perf_sched_cb_dec(event->pmu); + } else { + paiext_have_sample(event, cpump); + cpump->event = NULL; + } } event->hw.state = PERF_HES_STOPPED; } @@ -384,9 +431,7 @@ static void paiext_del(struct perf_event *event, int flags) /* Disable CPU instruction lookup for PAIE1 control block */ local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); pcb->acc = 0; - S390_lowcore.aicd = 0; - debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", - __func__, S390_lowcore.aicd, pcb->acc); + get_lowcore()->aicd = 0; } } @@ -470,21 +515,28 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, } /* Check if there is data to be saved on schedule out of a task. 
*/ -static int paiext_have_sample(void) +static void paiext_have_sample(struct perf_event *event, + struct paiext_map *cpump) { - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; size_t rawsize; - int rc = 0; if (!event) - return 0; + return; rawsize = paiext_copy(cpump->save, cpump->area, (unsigned long *)PAI_SAVE_AREA(event)); if (rawsize) /* Incremented counters */ - rc = paiext_push_sample(rawsize, cpump, event); - return rc; + paiext_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paiext_have_samples(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paiext_have_sample(event, cpump); } /* Called on schedule-in and schedule-out. No access to event structure, @@ -493,10 +545,10 @@ static int paiext_have_sample(void) static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, clear values. + * results on schedule_out and if page was dirty, save old values. */ if (!sched_in) - paiext_have_sample(); + paiext_have_samples(); } /* Attribute definitions for pai extension1 interface. As with other CPU @@ -542,7 +594,7 @@ static const struct attribute_group *paiext_attr_groups[] = { /* Performance monitoring unit for mapped counters */ static struct pmu paiext = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = paiext_event_init, .add = paiext_add, .del = paiext_del, diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index d8740631df4b..9637aee43c40 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -71,10 +71,10 @@ void flush_thread(void) void arch_setup_new_exec(void) { - if (S390_lowcore.current_pid != current->pid) { - S390_lowcore.current_pid = current->pid; + if (get_lowcore()->current_pid != current->pid) { + get_lowcore()->current_pid = current->pid; if (test_facility(40)) - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); } } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 90c2c786bb35..3993f4caf224 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -421,16 +421,16 @@ static void __init setup_lowcore(void) lc->clock_comparator = clock_comparator_max; lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; - lc->machine_flags = S390_lowcore.machine_flags; - lc->preempt_count = S390_lowcore.preempt_count; + lc->machine_flags = get_lowcore()->machine_flags; + lc->preempt_count = get_lowcore()->preempt_count; nmi_alloc_mcesa_early(&lc->mcesad); - lc->sys_enter_timer = S390_lowcore.sys_enter_timer; - lc->exit_timer = S390_lowcore.exit_timer; - lc->user_timer = S390_lowcore.user_timer; - lc->system_timer = S390_lowcore.system_timer; - lc->steal_timer = S390_lowcore.steal_timer; - lc->last_update_timer = S390_lowcore.last_update_timer; - lc->last_update_clock = S390_lowcore.last_update_clock; + lc->sys_enter_timer = get_lowcore()->sys_enter_timer; + lc->exit_timer = get_lowcore()->exit_timer; + lc->user_timer = get_lowcore()->user_timer; + lc->system_timer = get_lowcore()->system_timer; + lc->steal_timer = get_lowcore()->steal_timer; + lc->last_update_timer = 
get_lowcore()->last_update_timer; + lc->last_update_clock = get_lowcore()->last_update_clock; /* * Allocate the global restart stack which is the same for * all CPUs in case *one* of them does a PSW restart. @@ -439,7 +439,7 @@ static void __init setup_lowcore(void) lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; - lc->kernel_stack = S390_lowcore.kernel_stack; + lc->kernel_stack = get_lowcore()->kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. This is necessary if @@ -455,8 +455,8 @@ static void __init setup_lowcore(void) lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - lc->kernel_asce = S390_lowcore.kernel_asce; - lc->user_asce = S390_lowcore.user_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; + lc->user_asce = get_lowcore()->user_asce; system_ctlreg_init_save_area(lc); abs_lc = get_abs_lowcore(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0324649aae0a..c3c54adf67bc 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -74,8 +74,6 @@ enum { CPU_STATE_CONFIGURED, }; -static DEFINE_PER_CPU(struct cpu *, cpu_device); - struct pcpu { unsigned long ec_mask; /* bit mask for ec_xxx functions */ unsigned long ec_clk; /* sigp timestamp for ec_xxx */ @@ -203,7 +201,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) mcck_stack = stack_alloc(); if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - memcpy(lc, &S390_lowcore, 512); + memcpy(lc, get_lowcore(), 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; @@ -265,9 +263,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; - lc->kernel_asce = S390_lowcore.kernel_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; lc->user_asce = s390_invalid_asce; - lc->machine_flags = S390_lowcore.machine_flags; + lc->machine_flags = get_lowcore()->machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; abs_lc = get_abs_lowcore(); @@ -407,7 +405,7 @@ void smp_call_ipl_cpu(void (*func)(void *), void *data) struct lowcore *lc = lowcore_ptr[0]; if (pcpu_devices[0].address == stap()) - lc = &S390_lowcore; + lc = get_lowcore(); pcpu_delegate(&pcpu_devices[0], func, data, lc->nodat_stack); @@ -719,8 +717,6 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) } } -static int smp_add_present_cpu(int cpu); - static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, bool configured, bool early) { @@ -744,7 +740,7 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, pcpu->state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); set_cpu_present(cpu, true); - if (!early && smp_add_present_cpu(cpu) != 0) + if (!early && arch_register_cpu(cpu)) set_cpu_present(cpu, false); else nr++; @@ -831,9 +827,6 @@ void __init smp_detect_cpus(void) s_cpus += smp_cpu_mtid + 1; } pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); - - /* Add CPUs present at boot */ - __smp_rescan_cpus(info, true); memblock_free(info, sizeof(*info)); } @@ -842,15 +835,16 @@ void 
__init smp_detect_cpus(void) */ static void smp_start_secondary(void *cpuvoid) { + struct lowcore *lc = get_lowcore(); int cpu = raw_smp_processor_id(); - S390_lowcore.last_update_clock = get_tod_clock(); - S390_lowcore.restart_stack = (unsigned long)restart_stack; - S390_lowcore.restart_fn = (unsigned long)do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1U; - S390_lowcore.restart_flags = 0; - restore_access_regs(S390_lowcore.access_regs_save_area); + lc->last_update_clock = get_tod_clock(); + lc->restart_stack = (unsigned long)restart_stack; + lc->restart_fn = (unsigned long)do_restart; + lc->restart_data = 0; + lc->restart_source = -1U; + lc->restart_flags = 0; + restore_access_regs(lc->access_regs_save_area); cpu_init(); rcutree_report_cpu_starting(cpu); init_cpu_timer(); @@ -973,6 +967,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); system_ctl_set_bit(0, 13); + smp_rescan_cpus(true); } void __init smp_prepare_boot_cpu(void) @@ -981,16 +976,18 @@ void __init smp_prepare_boot_cpu(void) WARN_ON(!cpu_present(0) || !cpu_online(0)); pcpu->state = CPU_STATE_CONFIGURED; - S390_lowcore.percpu_offset = __per_cpu_offset[0]; + get_lowcore()->percpu_offset = __per_cpu_offset[0]; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } void __init smp_setup_processor_id(void) { + struct lowcore *lc = get_lowcore(); + pcpu_devices[0].address = stap(); - S390_lowcore.cpu_nr = 0; - S390_lowcore.spinlock_lockval = arch_spin_lockval(0); - S390_lowcore.spinlock_index = 0; + lc->cpu_nr = 0; + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; } /* @@ -1108,35 +1105,34 @@ static struct attribute_group cpu_online_attr_group = { static int smp_cpu_online(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = &per_cpu(cpu_devices, cpu); - return sysfs_create_group(&s->kobj, &cpu_online_attr_group); + return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group); } static int smp_cpu_pre_down(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = &per_cpu(cpu_devices, cpu); - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group); return 0; } -static int smp_add_present_cpu(int cpu) +bool arch_cpu_is_hotpluggable(int cpu) +{ + return !!cpu; +} + +int arch_register_cpu(int cpu) { - struct device *s; - struct cpu *c; + struct cpu *c = &per_cpu(cpu_devices, cpu); int rc; - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - per_cpu(cpu_device, cpu) = c; - s = &c->dev; - c->hotpluggable = !!cpu; + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); rc = register_cpu(c, cpu); if (rc) goto out; - rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + rc = sysfs_create_group(&c->dev.kobj, &cpu_common_attr_group); if (rc) goto out_cpu; rc = topology_cpu_init(c); @@ -1145,14 +1141,14 @@ static int smp_add_present_cpu(int cpu) return 0; out_topology: - sysfs_remove_group(&s->kobj, &cpu_common_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_common_attr_group); out_cpu: unregister_cpu(c); out: return rc; } -int __ref smp_rescan_cpus(void) +int __ref smp_rescan_cpus(bool early) { struct sclp_core_info *info; int nr; @@ -1161,7 +1157,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - nr = __smp_rescan_cpus(info, false); + nr = __smp_rescan_cpus(info, 
early);
 	kfree(info);
 	if (nr)
 		topology_schedule_update();
@@ -1178,7 +1174,7 @@ static ssize_t __ref rescan_store(struct device *dev,
 	rc = lock_device_hotplug_sysfs();
 	if (rc)
 		return rc;
-	rc = smp_rescan_cpus();
+	rc = smp_rescan_cpus(false);
 	unlock_device_hotplug();
 	return rc ? rc : count;
 }
@@ -1187,7 +1183,7 @@ static DEVICE_ATTR_WO(rescan);
 static int __init s390_smp_init(void)
 {
 	struct device *dev_root;
-	int cpu, rc = 0;
+	int rc;

 	dev_root = bus_get_dev_root(&cpu_subsys);
 	if (dev_root) {
@@ -1196,17 +1192,9 @@ static int __init s390_smp_init(void)
 		if (rc)
 			return rc;
 	}
-
-	for_each_present_cpu(cpu) {
-		rc = smp_add_present_cpu(cpu);
-		if (rc)
-			goto out;
-	}
-
 	rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online",
 			       smp_cpu_online, smp_cpu_pre_down);
 	rc = rc <= 0 ? rc : 0;
-out:
 	return rc;
 }
 subsys_initcall(s390_smp_init);
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 30bb20461db4..1cf2ad04f8e9 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -300,33 +300,56 @@ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
 	return (struct diag204_x_part_block *)&block->cpus[i];
 }

-static void fill_diag(struct sthyi_sctns *sctns)
+static void *diag204_get_data(bool diag204_allow_busy)
 {
-	int i, r, pages;
-	bool this_lpar;
+	unsigned long subcode;
 	void *diag204_buf;
-	void *diag224_buf = NULL;
-	struct diag204_x_info_blk_hdr *ti_hdr;
-	struct diag204_x_part_block *part_block;
-	struct diag204_x_phys_block *phys_block;
-	struct lpar_cpu_inf lpar_inf = {};
-
-	/* Errors are handled through the validity bits in the response. */
-	pages = diag204((unsigned long)DIAG204_SUBC_RSI |
-			(unsigned long)DIAG204_INFO_EXT, 0, NULL);
-	if (pages <= 0)
-		return;
-
+	int pages, rc;
+
+	subcode = DIAG204_SUBC_RSI;
+	subcode |= DIAG204_INFO_EXT;
+	pages = diag204(subcode, 0, NULL);
+	if (pages < 0)
+		return ERR_PTR(pages);
+	if (pages == 0)
+		return ERR_PTR(-ENODATA);
 	diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
 				     PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
 				     __builtin_return_address(0));
 	if (!diag204_buf)
-		return;
+		return ERR_PTR(-ENOMEM);
+	subcode = DIAG204_SUBC_STIB7;
+	subcode |= DIAG204_INFO_EXT;
+	if (diag204_has_bif() && diag204_allow_busy)
+		subcode |= DIAG204_BIF_BIT;
+	rc = diag204(subcode, pages, diag204_buf);
+	if (rc < 0) {
+		vfree(diag204_buf);
+		return ERR_PTR(rc);
+	}
+	return diag204_buf;
+}

-	r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
-		    (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
-	if (r < 0)
-		goto out;
+static bool is_diag204_cached(struct sthyi_sctns *sctns)
+{
+	/*
+	 * Check if validity bits are set when diag204 data
+	 * is gathered.
+	 */
+	if (sctns->par.infpval1)
+		return true;
+	return false;
+}
+
+static void fill_diag(struct sthyi_sctns *sctns, void *diag204_buf)
+{
+	int i;
+	bool this_lpar;
+	void *diag224_buf = NULL;
+	struct diag204_x_info_blk_hdr *ti_hdr;
+	struct diag204_x_part_block *part_block;
+	struct diag204_x_phys_block *phys_block;
+	struct lpar_cpu_inf lpar_inf = {};

 	diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
 	if (!diag224_buf || diag224(diag224_buf))
@@ -392,7 +415,6 @@ static void fill_diag(struct sthyi_sctns *sctns)
 out:
 	free_page((unsigned long)diag224_buf);
-	vfree(diag204_buf);
 }

 static int sthyi(u64 vaddr, u64 *rc)
@@ -414,19 +436,31 @@ static int sthyi(u64 vaddr, u64 *rc)

 static int fill_dst(void *dst, u64 *rc)
 {
+	void *diag204_buf;
+
 	struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst;

 	/*
 	 * If the facility is on, we don't want to emulate the instruction.
 	 * We ask the hypervisor to provide the data.
 	 */
-	if (test_facility(74))
+	if (test_facility(74)) {
+		memset(dst, 0, PAGE_SIZE);
 		return sthyi((u64)dst, rc);
-
+	}
+	/*
+	 * When emulating, if diag204 returns BUSY don't reset dst buffer
+	 * and use cached data.
+	 */
+	*rc = 0;
+	diag204_buf = diag204_get_data(is_diag204_cached(sctns));
+	if (IS_ERR(diag204_buf))
+		return PTR_ERR(diag204_buf);
+	memset(dst, 0, PAGE_SIZE);
 	fill_hdr(sctns);
 	fill_stsi(sctns);
-	fill_diag(sctns);
-	*rc = 0;
+	fill_diag(sctns, diag204_buf);
+	vfree(diag204_buf);
 	return 0;
 }

@@ -445,11 +479,14 @@ static int sthyi_update_cache(u64 *rc)
 {
 	int r;

-	memset(sthyi_cache.info, 0, PAGE_SIZE);
 	r = fill_dst(sthyi_cache.info, rc);
-	if (r)
-		return r;
-	sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES;
+	if (r == 0) {
+		sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES;
+	} else if (r == -EBUSY) {
+		/* mark as expired and return 0 to keep using cached data */
+		sthyi_cache.end = jiffies - 1;
+		r = 0;
+	}
 	return r;
 }
diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c
index 50cbcbbaa03d..5ec28028315b 100644
--- a/arch/s390/kernel/syscall.c
+++ b/arch/s390/kernel/syscall.c
@@ -124,8 +124,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 {
 	add_random_kstack_offset();
 	enter_from_user_mode(regs);
-	regs->psw = S390_lowcore.svc_old_psw;
-	regs->int_code = S390_lowcore.svc_int_code;
+	regs->psw = get_lowcore()->svc_old_psw;
+	regs->int_code = get_lowcore()->svc_int_code;
 	update_timer_sys();
 	if (static_branch_likely(&cpu_has_bear))
 		current->thread.last_break = regs->last_break;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index fb9f31f36628..b713effe0579 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -131,7 +131,7 @@ void clock_comparator_work(void)
 {
 	struct clock_event_device *cd;

-	S390_lowcore.clock_comparator = clock_comparator_max;
+	get_lowcore()->clock_comparator = clock_comparator_max;
 	cd = this_cpu_ptr(&comparators);
 	cd->event_handler(cd);
 }
@@ -139,8 +139,8 @@ void clock_comparator_work(void)

 static int s390_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	S390_lowcore.clock_comparator = get_tod_clock() + delta;
-	set_clock_comparator(S390_lowcore.clock_comparator);
+	get_lowcore()->clock_comparator = get_tod_clock() + delta;
+	set_clock_comparator(get_lowcore()->clock_comparator);
 	return 0;
 }

@@ -153,8 +153,8 @@ void init_cpu_timer(void)
 	struct clock_event_device *cd;
 	int cpu;

-	S390_lowcore.clock_comparator = clock_comparator_max;
-	set_clock_comparator(S390_lowcore.clock_comparator);
+	get_lowcore()->clock_comparator = clock_comparator_max;
+	set_clock_comparator(get_lowcore()->clock_comparator);

 	cpu = smp_processor_id();
 	cd = &per_cpu(comparators, cpu);
@@ -184,8 +184,8 @@ static void clock_comparator_interrupt(struct ext_code ext_code,
 					unsigned long param64)
 {
 	inc_irq_stat(IRQEXT_CLK);
-	if (S390_lowcore.clock_comparator == clock_comparator_max)
-		set_clock_comparator(S390_lowcore.clock_comparator);
+	if (get_lowcore()->clock_comparator == clock_comparator_max)
+		set_clock_comparator(get_lowcore()->clock_comparator);
 }

 static void stp_timing_alert(struct stp_irq_parm *);
@@ -408,12 +408,12 @@ static void clock_sync_global(long delta)
 static void clock_sync_local(long delta)
 {
 	/* Add the delta to the clock comparator. */
-	if (S390_lowcore.clock_comparator != clock_comparator_max) {
-		S390_lowcore.clock_comparator += delta;
-		set_clock_comparator(S390_lowcore.clock_comparator);
+	if (get_lowcore()->clock_comparator != clock_comparator_max) {
+		get_lowcore()->clock_comparator += delta;
+		set_clock_comparator(get_lowcore()->clock_comparator);
 	}
 	/* Adjust the last_update_clock time-stamp. */
-	S390_lowcore.last_update_clock += delta;
+	get_lowcore()->last_update_clock += delta;
 }

 /* Single threaded workqueue used for stp sync events */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 89e91b8ce842..98ef6dc7916b 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -320,16 +320,10 @@ static int __arch_update_cpu_topology(void)

 int arch_update_cpu_topology(void)
 {
-	struct device *dev;
-	int cpu, rc;
+	int rc;

 	rc = __arch_update_cpu_topology();
 	on_each_cpu(__arch_update_dedicated_flag, NULL, 0);
-	for_each_online_cpu(cpu) {
-		dev = get_cpu_device(cpu);
-		if (dev)
-			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-	}
 	return rc;
 }
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 52578b5cecbd..a7c211a3a0c9 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -288,15 +288,16 @@ static void __init test_monitor_call(void)

 void __init trap_init(void)
 {
+	struct lowcore *lc = get_lowcore();
 	unsigned long flags;
 	struct ctlreg cr0;

 	local_irq_save(flags);
 	cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
-	psw_bits(S390_lowcore.external_new_psw).mcheck = 1;
-	psw_bits(S390_lowcore.program_new_psw).mcheck = 1;
-	psw_bits(S390_lowcore.svc_new_psw).mcheck = 1;
-	psw_bits(S390_lowcore.io_new_psw).mcheck = 1;
+	psw_bits(lc->external_new_psw).mcheck = 1;
+	psw_bits(lc->program_new_psw).mcheck = 1;
+	psw_bits(lc->svc_new_psw).mcheck = 1;
+	psw_bits(lc->io_new_psw).mcheck = 1;
 	local_ctl_load(0, &cr0);
 	local_irq_restore(flags);
 	local_mcck_enable();
@@ -307,11 +308,12 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);

 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
-	unsigned int trapnr;
+	struct lowcore *lc = get_lowcore();
 	irqentry_state_t state;
+	unsigned int trapnr;

-	regs->int_code = S390_lowcore.pgm_int_code;
-	regs->int_parm_long = S390_lowcore.trans_exc_code;
+	regs->int_code = lc->pgm_int_code;
+	regs->int_parm_long = lc->trans_exc_code;

 	state = irqentry_enter(regs);

@@ -324,19 +326,19 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
 		current->thread.last_break = regs->last_break;
 	}

-	if (S390_lowcore.pgm_code & 0x0200) {
+	if (lc->pgm_code & 0x0200) {
 		/* transaction abort */
-		current->thread.trap_tdb = S390_lowcore.pgm_tdb;
+		current->thread.trap_tdb = lc->pgm_tdb;
 	}

-	if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) {
+	if (lc->pgm_code & PGM_INT_CODE_PER) {
 		if (user_mode(regs)) {
 			struct per_event *ev = &current->thread.per_event;

 			set_thread_flag(TIF_PER_TRAP);
-			ev->address = S390_lowcore.per_address;
-			ev->cause = S390_lowcore.per_code_combined;
-			ev->paid = S390_lowcore.per_access_id;
+			ev->address = lc->per_address;
+			ev->cause = lc->per_code_combined;
+			ev->paid = lc->per_access_id;
 		} else {
 			/* PER event in kernel is kprobes */
 			__arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 265fea37e030..fa62fa0e369f 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(uv_pin_shared);
  *
  * @paddr: Absolute host address of page to be destroyed
  */
-static int uv_destroy_page(unsigned long paddr)
+static int uv_destroy(unsigned long paddr)
 {
 	struct uv_cb_cfs uvcb = {
 		.header.cmd = UVC_CMD_DESTR_SEC_STOR,
@@ -131,28 +131,40 @@ static int uv_destroy_page(unsigned long paddr)
 }

 /*
- * The caller must already hold a reference to the page
+ * The caller must already hold a reference to the folio
  */
-int uv_destroy_owned_page(unsigned long paddr)
+int uv_destroy_folio(struct folio *folio)
 {
-	struct page *page = phys_to_page(paddr);
 	int rc;

-	get_page(page);
-	rc = uv_destroy_page(paddr);
+	/* See gmap_make_secure(): large folios cannot be secure */
+	if (unlikely(folio_test_large(folio)))
+		return 0;
+
+	folio_get(folio);
+	rc = uv_destroy(folio_to_phys(folio));
 	if (!rc)
-		clear_bit(PG_arch_1, &page->flags);
-	put_page(page);
+		clear_bit(PG_arch_1, &folio->flags);
+	folio_put(folio);
 	return rc;
 }

 /*
+ * The present PTE still indirectly holds a folio reference through the mapping.
+ */
+int uv_destroy_pte(pte_t pte)
+{
+	VM_WARN_ON(!pte_present(pte));
+	return uv_destroy_folio(pfn_folio(pte_pfn(pte)));
+}
+
+/*
  * Requests the Ultravisor to encrypt a guest page and make it
  * accessible to the host for paging (export).
  *
  * @paddr: Absolute host address of page to be exported
  */
-int uv_convert_from_secure(unsigned long paddr)
+static int uv_convert_from_secure(unsigned long paddr)
 {
 	struct uv_cb_cfs uvcb = {
 		.header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
@@ -166,22 +178,34 @@ int uv_convert_from_secure(unsigned long paddr)
 }

 /*
- * The caller must already hold a reference to the page
+ * The caller must already hold a reference to the folio.
  */
-int uv_convert_owned_from_secure(unsigned long paddr)
+static int uv_convert_from_secure_folio(struct folio *folio)
 {
-	struct page *page = phys_to_page(paddr);
 	int rc;

-	get_page(page);
-	rc = uv_convert_from_secure(paddr);
+	/* See gmap_make_secure(): large folios cannot be secure */
+	if (unlikely(folio_test_large(folio)))
+		return 0;
+
+	folio_get(folio);
+	rc = uv_convert_from_secure(folio_to_phys(folio));
 	if (!rc)
-		clear_bit(PG_arch_1, &page->flags);
-	put_page(page);
+		clear_bit(PG_arch_1, &folio->flags);
+	folio_put(folio);
 	return rc;
 }

 /*
+ * The present PTE still indirectly holds a folio reference through the mapping.
+ */
+int uv_convert_from_secure_pte(pte_t pte)
+{
+	VM_WARN_ON(!pte_present(pte));
+	return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte)));
+}
+
+/*
  * Calculate the expected ref_count for a folio that would otherwise have no
  * further pins. This was cribbed from similar functions in other places in
  * the kernel, but with some slight modifications. We know that a secure
@@ -267,6 +291,36 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
 }

 /*
+ * Drain LRU caches: the local one on first invocation and the ones of all
+ * CPUs on successive invocations. Returns "true" on the first invocation.
+ */
+static bool drain_lru(bool *drain_lru_called)
+{
+	/*
+	 * If we have tried a local drain and the folio refcount
+	 * still does not match our expected safe value, try with a
+	 * system wide drain. This is needed if the pagevecs holding
+	 * the page are on a different CPU.
+	 */
+	if (*drain_lru_called) {
+		lru_add_drain_all();
+		/* We give up here, don't retry immediately. */
+		return false;
+	}
+	/*
+	 * We are here if the folio refcount does not match the
+	 * expected safe value. The main culprits are usually
+	 * pagevecs. With lru_add_drain() we drain the pagevecs
+	 * on the local CPU so that hopefully the refcount will
+	 * reach the expected safe value.
+	 */
+	lru_add_drain();
+	*drain_lru_called = true;
+	/* The caller should try again immediately */
+	return true;
+}
+
+/*
  * Requests the Ultravisor to make a page accessible to a guest.
  * If it's brought in the first time, it will be cleared. If
  * it has been exported before, it will be decrypted and integrity
@@ -275,7 +329,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
 int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
 {
 	struct vm_area_struct *vma;
-	bool local_drain = false;
+	bool drain_lru_called = false;
 	spinlock_t *ptelock;
 	unsigned long uaddr;
 	struct folio *folio;
@@ -308,52 +362,63 @@ again:
 		goto out;
 	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
 		folio = page_folio(pte_page(*ptep));
-		rc = -EINVAL;
-		if (folio_test_large(folio))
-			goto unlock;
 		rc = -EAGAIN;
-		if (folio_trylock(folio)) {
+		if (folio_test_large(folio)) {
+			rc = -E2BIG;
+		} else if (folio_trylock(folio)) {
 			if (should_export_before_import(uvcb, gmap->mm))
 				uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
 			rc = make_folio_secure(folio, uvcb);
 			folio_unlock(folio);
 		}
+
+		/*
+		 * Once we drop the PTL, the folio may get unmapped and
+		 * freed immediately. We need a temporary reference.
+		 */
+		if (rc == -EAGAIN || rc == -E2BIG)
+			folio_get(folio);
 	}
-unlock:
 	pte_unmap_unlock(ptep, ptelock);
 out:
 	mmap_read_unlock(gmap->mm);

-	if (rc == -EAGAIN) {
+	switch (rc) {
+	case -E2BIG:
+		folio_lock(folio);
+		rc = split_folio(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+
+		switch (rc) {
+		case 0:
+			/* Splitting succeeded, try again immediately. */
+			goto again;
+		case -EAGAIN:
+			/* Additional folio references. */
+			if (drain_lru(&drain_lru_called))
+				goto again;
+			return -EAGAIN;
+		case -EBUSY:
+			/* Unexpected race. */
+			return -EAGAIN;
+		}
+		WARN_ON_ONCE(1);
+		return -ENXIO;
+	case -EAGAIN:
 		/*
 		 * If we are here because the UVC returned busy or partial
 		 * completion, this is just a useless check, but it is safe.
 		 */
 		folio_wait_writeback(folio);
-	} else if (rc == -EBUSY) {
-		/*
-		 * If we have tried a local drain and the folio refcount
-		 * still does not match our expected safe value, try with a
-		 * system wide drain. This is needed if the pagevecs holding
-		 * the page are on a different CPU.
-		 */
-		if (local_drain) {
-			lru_add_drain_all();
-			/* We give up here, and let the caller try again */
-			return -EAGAIN;
-		}
-		/*
-		 * We are here if the folio refcount does not match the
-		 * expected safe value. The main culprits are usually
-		 * pagevecs. With lru_add_drain() we drain the pagevecs
-		 * on the local CPU so that hopefully the refcount will
-		 * reach the expected safe value.
-		 */
-		lru_add_drain();
-		local_drain = true;
-		/* And now we try again immediately after draining */
-		goto again;
-	} else if (rc == -ENXIO) {
+		folio_put(folio);
+		return -EAGAIN;
+	case -EBUSY:
+		/* Additional folio references. */
+		if (drain_lru(&drain_lru_called))
+			goto again;
+		return -EAGAIN;
+	case -ENXIO:
 		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
 			return -EFAULT;
 		return -EAGAIN;
@@ -388,6 +453,7 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
 {
 	struct vm_area_struct *vma;
 	unsigned long uaddr;
+	struct folio *folio;
 	struct page *page;
 	int rc;

@@ -411,7 +477,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
 	page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
 	if (IS_ERR_OR_NULL(page))
 		goto out;
-	rc = uv_destroy_owned_page(page_to_phys(page));
+	folio = page_folio(page);
+	rc = uv_destroy_folio(folio);
 	/*
 	 * Fault handlers can race; it is possible that two CPUs will fault
 	 * on the same secure page. One CPU can destroy the page, reboot,
@@ -422,8 +489,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
 	 * we instead try to export the page.
 	 */
 	if (rc)
-		rc = uv_convert_owned_from_secure(page_to_phys(page));
-	put_page(page);
+		rc = uv_convert_from_secure_folio(folio);
+	folio_put(folio);
 out:
 	mmap_read_unlock(gmap->mm);
 	return rc;
@@ -431,47 +498,51 @@ out:
 EXPORT_SYMBOL_GPL(gmap_destroy_page);

 /*
- * To be called with the page locked or with an extra reference! This will
- * prevent gmap_make_secure from touching the page concurrently. Having 2
- * parallel make_page_accessible is fine, as the UV calls will become a
- * no-op if the page is already exported.
+ * To be called with the folio locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the folio concurrently. Having 2
+ * parallel arch_make_folio_accessible is fine, as the UV calls will become a
+ * no-op if the folio is already exported.
  */
-int arch_make_page_accessible(struct page *page)
+int arch_make_folio_accessible(struct folio *folio)
 {
 	int rc = 0;

-	/* Hugepage cannot be protected, so nothing to do */
-	if (PageHuge(page))
+	/* See gmap_make_secure(): large folios cannot be secure */
+	if (unlikely(folio_test_large(folio)))
 		return 0;

 	/*
-	 * PG_arch_1 is used in 3 places:
-	 * 1. for kernel page tables during early boot
-	 * 2. for storage keys of huge pages and KVM
-	 * 3. As an indication that this page might be secure. This can
+	 * PG_arch_1 is used in 2 places:
+	 * 1. for storage keys of hugetlb folios and KVM
+	 * 2. As an indication that this small folio might be secure. This can
 	 *    overindicate, e.g. we set the bit before calling
 	 *    convert_to_secure.
-	 * As secure pages are never huge, all 3 variants can co-exists.
+	 * As secure pages are never large folios, both variants can co-exists.
 	 */
-	if (!test_bit(PG_arch_1, &page->flags))
+	if (!test_bit(PG_arch_1, &folio->flags))
 		return 0;

-	rc = uv_pin_shared(page_to_phys(page));
+	rc = uv_pin_shared(folio_to_phys(folio));
 	if (!rc) {
-		clear_bit(PG_arch_1, &page->flags);
+		clear_bit(PG_arch_1, &folio->flags);
 		return 0;
 	}

-	rc = uv_convert_from_secure(page_to_phys(page));
+	rc = uv_convert_from_secure(folio_to_phys(folio));
 	if (!rc) {
-		clear_bit(PG_arch_1, &page->flags);
+		clear_bit(PG_arch_1, &folio->flags);
 		return 0;
 	}

 	return rc;
 }
-EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+EXPORT_SYMBOL_GPL(arch_make_folio_accessible);
+
+int arch_make_page_accessible(struct page *page)
+{
+	return arch_make_folio_accessible(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
 #endif

 #if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index ffc1db0cbf9c..234a0ba30510 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -35,14 +35,15 @@ static DEFINE_PER_CPU(u64, mt_scaling_jiffies);

 static inline void set_vtimer(u64 expires)
 {
+	struct lowcore *lc = get_lowcore();
 	u64 timer;

 	asm volatile(
 		"	stpt	%0\n"	/* Store current cpu timer value */
 		"	spt	%1"	/* Set new value imm. afterwards */
 		: "=Q" (timer) : "Q" (expires));
-	S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer;
-	S390_lowcore.last_update_timer = expires;
+	lc->system_timer += lc->last_update_timer - timer;
+	lc->last_update_timer = expires;
 }

 static inline int virt_timer_forward(u64 elapsed)
@@ -117,22 +118,23 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime,
 static int do_account_vtime(struct task_struct *tsk)
 {
 	u64 timer, clock, user, guest, system, hardirq, softirq;
+	struct lowcore *lc = get_lowcore();

-	timer = S390_lowcore.last_update_timer;
-	clock = S390_lowcore.last_update_clock;
+	timer = lc->last_update_timer;
+	clock = lc->last_update_clock;
 	asm volatile(
 		"	stpt	%0\n"	/* Store current cpu timer value */
 		"	stckf	%1"	/* Store current tod clock value */
-		: "=Q" (S390_lowcore.last_update_timer),
-		  "=Q" (S390_lowcore.last_update_clock)
+		: "=Q" (lc->last_update_timer),
+		  "=Q" (lc->last_update_clock)
 		: : "cc");
-	clock = S390_lowcore.last_update_clock - clock;
-	timer -= S390_lowcore.last_update_timer;
+	clock = lc->last_update_clock - clock;
+	timer -= lc->last_update_timer;

 	if (hardirq_count())
-		S390_lowcore.hardirq_timer += timer;
+		lc->hardirq_timer += timer;
 	else
-		S390_lowcore.system_timer += timer;
+		lc->system_timer += timer;

 	/* Update MT utilization calculation */
 	if (smp_cpu_mtid &&
@@ -141,16 +143,16 @@ static int do_account_vtime(struct task_struct *tsk)

 	/* Calculate cputime delta */
 	user = update_tsk_timer(&tsk->thread.user_timer,
-				READ_ONCE(S390_lowcore.user_timer));
+				READ_ONCE(lc->user_timer));
 	guest = update_tsk_timer(&tsk->thread.guest_timer,
-				 READ_ONCE(S390_lowcore.guest_timer));
+				 READ_ONCE(lc->guest_timer));
 	system = update_tsk_timer(&tsk->thread.system_timer,
-				  READ_ONCE(S390_lowcore.system_timer));
+				  READ_ONCE(lc->system_timer));
 	hardirq = update_tsk_timer(&tsk->thread.hardirq_timer,
-				   READ_ONCE(S390_lowcore.hardirq_timer));
+				   READ_ONCE(lc->hardirq_timer));
 	softirq = update_tsk_timer(&tsk->thread.softirq_timer,
-				   READ_ONCE(S390_lowcore.softirq_timer));
-	S390_lowcore.steal_timer +=
+				   READ_ONCE(lc->softirq_timer));
+	lc->steal_timer +=
 		clock - user - guest - system - hardirq - softirq;

 	/* Push account value */
@@ -176,17 +178,19 @@ static int do_account_vtime(struct task_struct *tsk)

 void vtime_task_switch(struct task_struct *prev)
 {
+	struct lowcore *lc = get_lowcore();
+
 	do_account_vtime(prev);
-	prev->thread.user_timer = S390_lowcore.user_timer;
-	prev->thread.guest_timer = S390_lowcore.guest_timer;
-	prev->thread.system_timer = S390_lowcore.system_timer;
-	prev->thread.hardirq_timer = S390_lowcore.hardirq_timer;
-	prev->thread.softirq_timer = S390_lowcore.softirq_timer;
-	S390_lowcore.user_timer = current->thread.user_timer;
-	S390_lowcore.guest_timer = current->thread.guest_timer;
-	S390_lowcore.system_timer = current->thread.system_timer;
-	S390_lowcore.hardirq_timer = current->thread.hardirq_timer;
-	S390_lowcore.softirq_timer = current->thread.softirq_timer;
+	prev->thread.user_timer = lc->user_timer;
+	prev->thread.guest_timer = lc->guest_timer;
+	prev->thread.system_timer = lc->system_timer;
+	prev->thread.hardirq_timer = lc->hardirq_timer;
+	prev->thread.softirq_timer = lc->softirq_timer;
+	lc->user_timer = current->thread.user_timer;
+	lc->guest_timer = current->thread.guest_timer;
+	lc->system_timer = current->thread.system_timer;
+	lc->hardirq_timer = current->thread.hardirq_timer;
+	lc->softirq_timer = current->thread.softirq_timer;
 }

 /*
@@ -196,28 +200,29 @@ void vtime_task_switch(struct task_struct *prev)
  */
 void vtime_flush(struct task_struct *tsk)
 {
+	struct lowcore *lc = get_lowcore();
 	u64 steal, avg_steal;

 	if (do_account_vtime(tsk))
 		virt_timer_expire();

-	steal = S390_lowcore.steal_timer;
-	avg_steal = S390_lowcore.avg_steal_timer;
+	steal = lc->steal_timer;
+	avg_steal = lc->avg_steal_timer;
 	if ((s64) steal > 0) {
-		S390_lowcore.steal_timer = 0;
+		lc->steal_timer = 0;
 		account_steal_time(cputime_to_nsecs(steal));
 		avg_steal += steal;
 	}
-	S390_lowcore.avg_steal_timer = avg_steal / 2;
+	lc->avg_steal_timer = avg_steal / 2;
 }

 static u64 vtime_delta(void)
 {
-	u64 timer = S390_lowcore.last_update_timer;
-
-	S390_lowcore.last_update_timer = get_cpu_timer();
+	struct lowcore *lc = get_lowcore();
+	u64 timer = lc->last_update_timer;

-	return timer - S390_lowcore.last_update_timer;
+	lc->last_update_timer = get_cpu_timer();
+	return timer - lc->last_update_timer;
 }

 /*
@@ -226,12 +231,13 @@ static u64 vtime_delta(void)
  */
 void vtime_account_kernel(struct task_struct *tsk)
 {
+	struct lowcore *lc = get_lowcore();
 	u64 delta = vtime_delta();

 	if (tsk->flags & PF_VCPU)
-		S390_lowcore.guest_timer += delta;
+		lc->guest_timer += delta;
 	else
-		S390_lowcore.system_timer += delta;
+		lc->system_timer += delta;

 	virt_timer_forward(delta);
 }
@@ -241,7 +247,7 @@ void vtime_account_softirq(struct task_struct *tsk)
 {
 	u64 delta = vtime_delta();

-	S390_lowcore.softirq_timer += delta;
+	get_lowcore()->softirq_timer += delta;

 	virt_timer_forward(delta);
 }
@@ -250,7 +256,7 @@ void vtime_account_hardirq(struct task_struct *tsk)
 {
 	u64 delta = vtime_delta();

-	S390_lowcore.hardirq_timer += delta;
+	get_lowcore()->hardirq_timer += delta;

 	virt_timer_forward(delta);
 }