author    Linus Torvalds <torvalds@linux-foundation.org>    2024-07-18 15:41:45 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>    2024-07-18 15:41:45 -0700
commit    1c7d0c3af5cc8adafef6477f9416820fc894ca40 (patch)
tree      449450c8ca1726cefb8197256b61c4de0b7cfddb /arch/s390/kernel
parent    dde1a0e1625c08cf4f958348a83434b2ddecf449 (diff)
parent    df39038cd89525d465c2c8827eb64116873f141a (diff)
Merge tag 's390-6.11-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull s390 updates from Vasily Gorbik:

 - Remove restrictions on PAI NNPA and crypto counters, enabling
   concurrent per-task and system-wide sampling and counting events

 - Switch to GENERIC_CPU_DEVICES by setting up the CPU present mask in
   the architecture code and letting the generic code handle CPU
   bring-up

 - Add support for the diag204 busy indication facility to prevent
   undesirable blocking during hypervisor logical CPU utilization
   queries. Implement results caching

 - Improve the handling of Store Data SCLP events by suppressing
   unnecessary warning, preventing buffer release in I/O during
   failures, and adding timeout handling for Store Data requests to
   address potential firmware issues

 - Provide optimized __arch_hweight*() implementations

 - Remove the unnecessary CPU KOBJ_CHANGE uevents generated during
   topology updates, as they are unused and also not present on other
   architectures

 - Cleanup atomic_ops, optimize __atomic_set() for small values and
   __atomic_cmpxchg_bool() for compilers supporting flag output
   constraint

 - Couple of cleanups for KVM:
     - Move and improve KVM struct definitions for DAT tables from
       gaccess.c to a new header
     - Pass the asce as parameter to sie64a()

 - Make the crdte() and cspg() page table handling wrappers return a
   boolean to indicate success, like the other existing "compare and
   swap" wrappers

 - Add documentation for HWCAP flags

 - Switch to obtaining total RAM pages from memblock instead of
   totalram_pages() during mm init, to ensure correct calculation of
   zero page size when defer_init is enabled

 - Refactor lowcore access and switch to using the get_lowcore()
   function instead of the S390_lowcore macro

 - Cleanups for PG_arch_1 and folio handling in UV and hugetlb code

 - Add missing MODULE_DESCRIPTION() macros

 - Fix VM_FAULT_HWPOISON handling in do_exception()

* tag 's390-6.11-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (54 commits)
  s390/mm: Fix VM_FAULT_HWPOISON handling in do_exception()
  s390/kvm: Move bitfields for dat tables
  s390/entry: Pass the asce as parameter to sie64a()
  s390/sthyi: Use cached data when diag is busy
  s390/sthyi: Move diag operations
  s390/hypfs_diag: Diag204 busy loop
  s390/diag: Add busy-indication-facility requirements
  s390/diag: Diag204 add busy return errno
  s390/diag: Return errno's from diag204
  s390/sclp: Diag204 busy indication facility detection
  s390/atomic_ops: Make use of flag output constraint
  s390/atomic_ops: Improve __atomic_set() for small values
  s390/atomic_ops: Use symbolic names
  s390/smp: Switch to GENERIC_CPU_DEVICES
  s390/hwcaps: Add documentation for HWCAP flags
  s390/pgtable: Make crdte() and cspg() return a value
  s390/topology: Remove CPU KOBJ_CHANGE uevents
  s390/sclp: Add timeout to Store Data requests
  s390/sclp: Prevent release of buffer in I/O
  s390/sclp: Suppress unnecessary Store Data warning
  ...
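Most of the churn in the diff below comes from the lowcore access refactor
mentioned above (S390_lowcore.field becomes get_lowcore()->field). As a quick
orientation, here is a minimal userspace C sketch of the accessor pattern the
callers are converted to; the struct layout, the static variable, and main()
are illustrative stand-ins only, not the kernel's actual lowcore definition
(the real get_lowcore() resolves the per-CPU prefix area):

  #include <stdio.h>

  struct lowcore {                      /* heavily trimmed stand-in */
          unsigned long machine_flags;
          unsigned long int_clock;
  };

  static struct lowcore boot_lowcore;   /* the real lowcore lives in the prefix area */

  static inline struct lowcore *get_lowcore(void)
  {
          return &boot_lowcore;
  }

  int main(void)
  {
          /* old style: S390_lowcore.machine_flags |= flag;   */
          /* new style: get_lowcore()->machine_flags |= flag; */
          get_lowcore()->machine_flags |= 1UL;
          printf("machine_flags=%#lx\n", get_lowcore()->machine_flags);
          return 0;
  }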
Diffstat (limited to 'arch/s390/kernel')
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kernel/diag.c12
-rw-r--r--arch/s390/kernel/dumpstack.c8
-rw-r--r--arch/s390/kernel/early.c36
-rw-r--r--arch/s390/kernel/entry.S8
-rw-r--r--arch/s390/kernel/idle.c11
-rw-r--r--arch/s390/kernel/irq.c18
-rw-r--r--arch/s390/kernel/machine_kexec.c4
-rw-r--r--arch/s390/kernel/nmi.c31
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c2
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c183
-rw-r--r--arch/s390/kernel/perf_pai_ext.c146
-rw-r--r--arch/s390/kernel/process.c6
-rw-r--r--arch/s390/kernel/setup.c24
-rw-r--r--arch/s390/kernel/smp.c88
-rw-r--r--arch/s390/kernel/sthyi.c95
-rw-r--r--arch/s390/kernel/syscall.c4
-rw-r--r--arch/s390/kernel/time.c22
-rw-r--r--arch/s390/kernel/topology.c8
-rw-r--r--arch/s390/kernel/traps.c28
-rw-r--r--arch/s390/kernel/uv.c207
-rw-r--r--arch/s390/kernel/vtime.c82
22 files changed, 610 insertions, 414 deletions
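The diag.c and sthyi.c hunks below change diag204() from returning a bare -1
to returning distinct errno values, and teach the STHYI emulation to keep
serving cached data when the hypervisor reports the busy condition. A small
self-contained sketch of the resulting calling convention; only the
DIAG204_BUSY_RC constant and the -EBUSY/-EOPNOTSUPP distinction come from the
diff, the function name and its arguments are hypothetical:

  #include <errno.h>
  #include <stdio.h>

  #define DIAG204_BUSY_RC 8             /* same constant diag.c now defines */

  /* Stand-in for the reworked diag204() wrapper: -EBUSY when the hypervisor
   * reports busy, another negative errno on failure, otherwise the amount of
   * data (pages) on success. */
  static int diag204_sketch(int hypervisor_rc, int pages)
  {
          if (hypervisor_rc == DIAG204_BUSY_RC)
                  return -EBUSY;
          if (hypervisor_rc)
                  return -EOPNOTSUPP;
          return pages;
  }

  int main(void)
  {
          int rc = diag204_sketch(DIAG204_BUSY_RC, 4);

          if (rc == -EBUSY)
                  printf("busy: keep serving cached data, retry later\n");
          else if (rc < 0)
                  printf("hard error: %d\n", rc);
          else
                  printf("got %d pages of fresh data\n", rc);
          return 0;
  }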
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index f55979f64d49..26bb45d0e6f1 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -63,6 +63,7 @@ int main(void)
OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
+ OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce);
DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain);
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 8dee9aa0ec95..9b65f04c83de 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -185,6 +185,8 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
}
EXPORT_SYMBOL(diag14);
+#define DIAG204_BUSY_RC 8
+
static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
{
union register_pair rp = { .even = *subcode, .odd = size };
@@ -215,16 +217,18 @@ int diag204(unsigned long subcode, unsigned long size, void *addr)
{
if (addr) {
if (WARN_ON_ONCE(!is_vmalloc_addr(addr)))
- return -1;
+ return -EINVAL;
if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE)))
- return -1;
+ return -EINVAL;
}
if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4)
addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr));
diag_stat_inc(DIAG_STAT_X204);
size = __diag204(&subcode, size, addr);
- if (subcode)
- return -1;
+ if (subcode == DIAG204_BUSY_RC)
+ return -EBUSY;
+ else if (subcode)
+ return -EOPNOTSUPP;
return size;
}
EXPORT_SYMBOL(diag204);
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index d2012635b093..1ecd0580561f 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -61,28 +61,28 @@ static bool in_task_stack(unsigned long sp, struct task_struct *task,
static bool in_irq_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET;
+ unsigned long stack = get_lowcore()->async_stack - STACK_INIT_OFFSET;
return in_stack(sp, info, STACK_TYPE_IRQ, stack);
}
static bool in_nodat_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET;
+ unsigned long stack = get_lowcore()->nodat_stack - STACK_INIT_OFFSET;
return in_stack(sp, info, STACK_TYPE_NODAT, stack);
}
static bool in_mcck_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET;
+ unsigned long stack = get_lowcore()->mcck_stack - STACK_INIT_OFFSET;
return in_stack(sp, info, STACK_TYPE_MCCK, stack);
}
static bool in_restart_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET;
+ unsigned long stack = get_lowcore()->restart_stack - STACK_INIT_OFFSET;
return in_stack(sp, info, STACK_TYPE_RESTART, stack);
}
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index c666271433fb..467ed4dba817 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -72,7 +72,7 @@ static void __init reset_tod_clock(void)
memset(&tod_clock_base, 0, sizeof(tod_clock_base));
tod_clock_base.tod = TOD_UNIX_EPOCH;
- S390_lowcore.last_update_clock = TOD_UNIX_EPOCH;
+ get_lowcore()->last_update_clock = TOD_UNIX_EPOCH;
}
/*
@@ -99,7 +99,7 @@ static noinline __init void detect_machine_type(void)
/* Check current-configuration-level */
if (stsi(NULL, 0, 0, 0) <= 2) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_LPAR;
return;
}
/* Get virtual-machine cpu information. */
@@ -108,9 +108,9 @@ static noinline __init void detect_machine_type(void)
/* Detect known hypervisors */
if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3))
- S390_lowcore.machine_flags |= MACHINE_FLAG_KVM;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_KVM;
else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4))
- S390_lowcore.machine_flags |= MACHINE_FLAG_VM;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_VM;
}
/* Remove leading, trailing and double whitespace. */
@@ -166,7 +166,7 @@ static __init void setup_topology(void)
if (!test_facility(11))
return;
- S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_TOPOLOGY;
for (max_mnest = 6; max_mnest > 1; max_mnest--) {
if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0)
break;
@@ -186,8 +186,8 @@ static noinline __init void setup_lowcore_early(void)
psw.addr = (unsigned long)early_pgm_check_handler;
psw.mask = PSW_KERNEL_BITS;
- S390_lowcore.program_new_psw = psw;
- S390_lowcore.preempt_count = INIT_PREEMPT_COUNT;
+ get_lowcore()->program_new_psw = psw;
+ get_lowcore()->preempt_count = INIT_PREEMPT_COUNT;
}
static noinline __init void setup_facility_list(void)
@@ -211,43 +211,43 @@ static __init void detect_diag9c(void)
EX_TABLE(0b,1b)
: "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc");
if (!rc)
- S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_DIAG9C;
}
static __init void detect_machine_facilities(void)
{
if (test_facility(8)) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT1;
system_ctl_set_bit(0, CR0_EDAT_BIT);
}
if (test_facility(78))
- S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT2;
if (test_facility(3))
- S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_IDTE;
if (test_facility(50) && test_facility(73)) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_TE;
system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT);
}
if (test_facility(51))
- S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_TLB_LC;
if (test_facility(129))
system_ctl_set_bit(0, CR0_VECTOR_BIT);
if (test_facility(130))
- S390_lowcore.machine_flags |= MACHINE_FLAG_NX;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_NX;
if (test_facility(133))
- S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_GS;
if (test_facility(139) && (tod_clock_base.tod >> 63)) {
/* Enabled signed clock comparator comparisons */
- S390_lowcore.machine_flags |= MACHINE_FLAG_SCC;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_SCC;
clock_comparator_max = -1ULL >> 1;
system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT);
}
if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_PCI_MIO;
/* the control bit is set during PCI initialization */
}
if (test_facility(194))
- S390_lowcore.machine_flags |= MACHINE_FLAG_RDP;
+ get_lowcore()->machine_flags |= MACHINE_FLAG_RDP;
}
static inline void save_vector_registers(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 60cf917a7122..454b6b92c7f8 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -179,6 +179,7 @@ SYM_FUNC_END(__switch_to_asm)
* %r2 pointer to sie control block phys
* %r3 pointer to sie control block virt
* %r4 guest register save area
+ * %r5 guest asce
*/
SYM_FUNC_START(__sie64a)
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
@@ -186,15 +187,12 @@ SYM_FUNC_START(__sie64a)
stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses
stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area
+ stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
lmg %r0,%r13,0(%r4) # load guest gprs 0-13
- lg %r14,__LC_GMAP # get gmap pointer
- ltgr %r14,%r14
- jz .Lsie_gmap
oi __LC_CPU_FLAGS+7,_CIF_SIE
- lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce
-.Lsie_gmap:
+ lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce
lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
tm __SIE_PROG20+3(%r14),3 # last exit...
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index af9c97c0ad73..39cb8d0ae348 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -24,6 +24,7 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
void account_idle_time_irq(void)
{
struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
+ struct lowcore *lc = get_lowcore();
unsigned long idle_time;
u64 cycles_new[8];
int i;
@@ -34,13 +35,13 @@ void account_idle_time_irq(void)
this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
}
- idle_time = S390_lowcore.int_clock - idle->clock_idle_enter;
+ idle_time = lc->int_clock - idle->clock_idle_enter;
- S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock;
- S390_lowcore.last_update_clock = S390_lowcore.int_clock;
+ lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock;
+ lc->last_update_clock = lc->int_clock;
- S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter;
- S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
+ lc->system_timer += lc->last_update_timer - idle->timer_idle_enter;
+ lc->last_update_timer = lc->sys_enter_timer;
/* Account time spent with enabled wait psw loaded as idle time. */
WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 9acc6630abd3..1af5a08d72ab 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -100,8 +100,8 @@ static const struct irq_class irqclass_sub_desc[] = {
static void do_IRQ(struct pt_regs *regs, int irq)
{
- if (tod_after_eq(S390_lowcore.int_clock,
- S390_lowcore.clock_comparator))
+ if (tod_after_eq(get_lowcore()->int_clock,
+ get_lowcore()->clock_comparator))
/* Serve timer interrupts first. */
clock_comparator_work();
generic_handle_irq(irq);
@@ -111,7 +111,7 @@ static int on_async_stack(void)
{
unsigned long frame = current_frame_address();
- return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
+ return ((get_lowcore()->async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
}
static void do_irq_async(struct pt_regs *regs, int irq)
@@ -119,7 +119,7 @@ static void do_irq_async(struct pt_regs *regs, int irq)
if (on_async_stack()) {
do_IRQ(regs, irq);
} else {
- call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+ call_on_stack(2, get_lowcore()->async_stack, void, do_IRQ,
struct pt_regs *, regs, int, irq);
}
}
@@ -153,8 +153,8 @@ void noinstr do_io_irq(struct pt_regs *regs)
set_cpu_flag(CIF_NOHZ_DELAY);
do {
- regs->tpi_info = S390_lowcore.tpi_info;
- if (S390_lowcore.tpi_info.adapter_IO)
+ regs->tpi_info = get_lowcore()->tpi_info;
+ if (get_lowcore()->tpi_info.adapter_IO)
do_irq_async(regs, THIN_INTERRUPT);
else
do_irq_async(regs, IO_INTERRUPT);
@@ -183,9 +183,9 @@ void noinstr do_ext_irq(struct pt_regs *regs)
current->thread.last_break = regs->last_break;
}
- regs->int_code = S390_lowcore.ext_int_code_addr;
- regs->int_parm = S390_lowcore.ext_params;
- regs->int_parm_long = S390_lowcore.ext_params2;
+ regs->int_code = get_lowcore()->ext_int_code_addr;
+ regs->int_parm = get_lowcore()->ext_params;
+ regs->int_parm_long = get_lowcore()->ext_params2;
from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
if (from_idle)
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 3aee98efc374..f4cf65da6d49 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -52,7 +52,7 @@ static void __do_machine_kdump(void *data)
purgatory = (purgatory_t)image->start;
/* store_status() saved the prefix register to lowcore */
- prefix = (unsigned long) S390_lowcore.prefixreg_save_area;
+ prefix = (unsigned long)get_lowcore()->prefixreg_save_area;
/* Now do the reset */
s390_reset_system();
@@ -91,7 +91,7 @@ static noinline void __machine_kdump(void *image)
continue;
}
/* Store status of the boot CPU */
- mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK);
if (cpu_has_vx())
save_vx_regs((__vector128 *) mcesa->vector_save_area);
if (MACHINE_HAS_GS) {
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 230d010bac9b..fbd218b6fc8e 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -117,6 +117,7 @@ static __always_inline char *u64_to_hex(char *dest, u64 val)
static notrace void s390_handle_damage(void)
{
+ struct lowcore *lc = get_lowcore();
union ctlreg0 cr0, cr0_new;
char message[100];
psw_t psw_save;
@@ -125,7 +126,7 @@ static notrace void s390_handle_damage(void)
smp_emergency_stop();
diag_amode31_ops.diag308_reset();
ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x");
- u64_to_hex(ptr, S390_lowcore.mcck_interruption_code);
+ u64_to_hex(ptr, lc->mcck_interruption_code);
/*
* Disable low address protection and make machine check new PSW a
@@ -135,17 +136,17 @@ static notrace void s390_handle_damage(void)
cr0_new = cr0;
cr0_new.lap = 0;
local_ctl_load(0, &cr0_new.reg);
- psw_save = S390_lowcore.mcck_new_psw;
- psw_bits(S390_lowcore.mcck_new_psw).io = 0;
- psw_bits(S390_lowcore.mcck_new_psw).ext = 0;
- psw_bits(S390_lowcore.mcck_new_psw).wait = 1;
+ psw_save = lc->mcck_new_psw;
+ psw_bits(lc->mcck_new_psw).io = 0;
+ psw_bits(lc->mcck_new_psw).ext = 0;
+ psw_bits(lc->mcck_new_psw).wait = 1;
sclp_emergency_printk(message);
/*
* Restore machine check new PSW and control register 0 to original
* values. This makes possible system dump analysis easier.
*/
- S390_lowcore.mcck_new_psw = psw_save;
+ lc->mcck_new_psw = psw_save;
local_ctl_load(0, &cr0.reg);
disabled_wait();
while (1);
@@ -226,7 +227,7 @@ static bool notrace nmi_registers_valid(union mci mci)
/*
* Set the clock comparator register to the next expected value.
*/
- set_clock_comparator(S390_lowcore.clock_comparator);
+ set_clock_comparator(get_lowcore()->clock_comparator);
if (!mci.gr || !mci.fp || !mci.fc)
return false;
/*
@@ -252,7 +253,7 @@ static bool notrace nmi_registers_valid(union mci mci)
* check handling must take care of this. The host values are saved by
* KVM and are not affected.
*/
- cr2.reg = S390_lowcore.cregs_save_area[2];
+ cr2.reg = get_lowcore()->cregs_save_area[2];
if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST))
return false;
if (!mci.ms || !mci.pm || !mci.ia)
@@ -278,11 +279,10 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs)
sie_page = container_of(sie_block, struct sie_page, sie_block);
mcck_backup = &sie_page->mcck_info;
- mcck_backup->mcic = S390_lowcore.mcck_interruption_code &
+ mcck_backup->mcic = get_lowcore()->mcck_interruption_code &
~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE);
- mcck_backup->ext_damage_code = S390_lowcore.external_damage_code;
- mcck_backup->failing_storage_address
- = S390_lowcore.failing_storage_address;
+ mcck_backup->ext_damage_code = get_lowcore()->external_damage_code;
+ mcck_backup->failing_storage_address = get_lowcore()->failing_storage_address;
}
NOKPROBE_SYMBOL(s390_backup_mcck_info);
@@ -302,6 +302,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
static int ipd_count;
static DEFINE_SPINLOCK(ipd_lock);
static unsigned long long last_ipd;
+ struct lowcore *lc = get_lowcore();
struct mcck_struct *mcck;
unsigned long long tmp;
irqentry_state_t irq_state;
@@ -314,7 +315,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
if (user_mode(regs))
update_timer_mcck();
inc_irq_stat(NMI_NMI);
- mci.val = S390_lowcore.mcck_interruption_code;
+ mci.val = lc->mcck_interruption_code;
mcck = this_cpu_ptr(&cpu_mcck);
/*
@@ -382,9 +383,9 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
}
if (mci.ed && mci.ec) {
/* External damage */
- if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
+ if (lc->external_damage_code & (1U << ED_STP_SYNC))
mcck->stp_queue |= stp_sync_check();
- if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
+ if (lc->external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
mcck_pending = 1;
}
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 06efad5b4f93..736c1d9632dd 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1022,7 +1022,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
}
/* Load current program parameter */
- lpp(&S390_lowcore.lpp);
+ lpp(&get_lowcore()->lpp);
debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i "
"interval %#lx tear %#lx dear %#lx\n", __func__,
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index 4ad472d130a3..2f5a20e300f6 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -36,8 +36,8 @@ struct paicrypt_map {
struct pai_userdata *save; /* Page to store no-zero counters */
unsigned int active_events; /* # of PAI crypto users */
refcount_t refcnt; /* Reference count mapped buffers */
- enum paievt_mode mode; /* Type of event */
struct perf_event *event; /* Perf event for sampling */
+ struct list_head syswide_list; /* List system-wide sampling events */
};
struct paicrypt_mapptr {
@@ -84,20 +84,16 @@ static DEFINE_MUTEX(pai_reserve_mutex);
/* Adjust usage counters and remove allocated memory when all users are
* gone.
*/
-static void paicrypt_event_destroy(struct perf_event *event)
+static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu)
{
- struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr,
- event->cpu);
+ struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu);
struct paicrypt_map *cpump = mp->mapptr;
- static_branch_dec(&pai_key);
mutex_lock(&pai_reserve_mutex);
- debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d"
- " mode %d refcnt %u\n", __func__,
- event->attr.config, event->cpu,
- cpump->active_events, cpump->mode,
+ debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d "
+ "refcnt %u\n", __func__, event->attr.config,
+ event->cpu, cpump->active_events,
refcount_read(&cpump->refcnt));
- free_page(PAI_SAVE_AREA(event));
if (refcount_dec_and_test(&cpump->refcnt)) {
debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
__func__, (unsigned long)cpump->page,
@@ -111,6 +107,23 @@ static void paicrypt_event_destroy(struct perf_event *event)
mutex_unlock(&pai_reserve_mutex);
}
+static void paicrypt_event_destroy(struct perf_event *event)
+{
+ int cpu;
+
+ static_branch_dec(&pai_key);
+ free_page(PAI_SAVE_AREA(event));
+ if (event->cpu == -1) {
+ struct cpumask *mask = PAI_CPU_MASK(event);
+
+ for_each_cpu(cpu, mask)
+ paicrypt_event_destroy_cpu(event, cpu);
+ kfree(mask);
+ } else {
+ paicrypt_event_destroy_cpu(event, event->cpu);
+ }
+}
+
static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel)
{
if (kernel)
@@ -156,23 +169,15 @@ static u64 paicrypt_getall(struct perf_event *event)
return sum;
}
-/* Used to avoid races in checking concurrent access of counting and
- * sampling for crypto events
- *
- * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is
- * allowed and when this event is running, no counting event is allowed.
- * Several counting events are allowed in parallel, but no sampling event
- * is allowed while one (or more) counting events are running.
- *
+/* Check concurrent access of counting and sampling for crypto events.
* This function is called in process context and it is save to block.
* When the event initialization functions fails, no other call back will
* be invoked.
*
* Allocate the memory for the event.
*/
-static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
+static struct paicrypt_map *paicrypt_busy(struct perf_event *event, int cpu)
{
- struct perf_event_attr *a = &event->attr;
struct paicrypt_map *cpump = NULL;
struct paicrypt_mapptr *mp;
int rc;
@@ -185,7 +190,7 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
goto unlock;
/* Allocate node for this event */
- mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu);
+ mp = per_cpu_ptr(paicrypt_root.mapptr, cpu);
cpump = mp->mapptr;
if (!cpump) { /* Paicrypt_map allocated? */
cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
@@ -193,25 +198,9 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
rc = -ENOMEM;
goto free_root;
}
+ INIT_LIST_HEAD(&cpump->syswide_list);
}
- if (a->sample_period) { /* Sampling requested */
- if (cpump->mode != PAI_MODE_NONE)
- rc = -EBUSY; /* ... sampling/counting active */
- } else { /* Counting requested */
- if (cpump->mode == PAI_MODE_SAMPLING)
- rc = -EBUSY; /* ... and sampling active */
- }
- /*
- * This error case triggers when there is a conflict:
- * Either sampling requested and counting already active, or visa
- * versa. Therefore the struct paicrypto_map for this CPU is
- * needed or the error could not have occurred. Only adjust root
- * node refcount.
- */
- if (rc)
- goto free_root;
-
/* Allocate memory for counter page and counter extraction.
* Only the first counting event has to allocate a page.
*/
@@ -235,26 +224,58 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
/* Set mode and reference count */
rc = 0;
refcount_set(&cpump->refcnt, 1);
- cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING;
mp->mapptr = cpump;
- debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d"
- " mode %d refcnt %u page %#lx save %p rc %d\n",
- __func__, a->sample_period, cpump->active_events,
- cpump->mode, refcount_read(&cpump->refcnt),
+ debug_sprintf_event(cfm_dbg, 5, "%s users %d refcnt %u page %#lx "
+ "save %p rc %d\n", __func__, cpump->active_events,
+ refcount_read(&cpump->refcnt),
(unsigned long)cpump->page, cpump->save, rc);
goto unlock;
free_paicrypt_map:
+ /* Undo memory allocation */
kfree(cpump);
mp->mapptr = NULL;
free_root:
paicrypt_root_free();
-
unlock:
mutex_unlock(&pai_reserve_mutex);
return rc ? ERR_PTR(rc) : cpump;
}
+static int paicrypt_event_init_all(struct perf_event *event)
+{
+ struct paicrypt_map *cpump;
+ struct cpumask *maskptr;
+ int cpu, rc = -ENOMEM;
+
+ maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL);
+ if (!maskptr)
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ cpump = paicrypt_busy(event, cpu);
+ if (IS_ERR(cpump)) {
+ for_each_cpu(cpu, maskptr)
+ paicrypt_event_destroy_cpu(event, cpu);
+ kfree(maskptr);
+ rc = PTR_ERR(cpump);
+ goto out;
+ }
+ cpumask_set_cpu(cpu, maskptr);
+ }
+
+ /*
+ * On error all cpumask are freed and all events have been destroyed.
+ * Save of which CPUs data structures have been allocated for.
+ * Release them in paicrypt_event_destroy call back function
+ * for this event.
+ */
+ PAI_CPU_MASK(event) = maskptr;
+ rc = 0;
+out:
+ return rc;
+}
+
/* Might be called on different CPU than the one the event is intended for. */
static int paicrypt_event_init(struct perf_event *event)
{
@@ -269,10 +290,7 @@ static int paicrypt_event_init(struct perf_event *event)
if (a->config < PAI_CRYPTO_BASE ||
a->config > PAI_CRYPTO_BASE + paicrypt_cnt)
return -EINVAL;
- /* Allow only CPU wide operation, no process context for now. */
- if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
- return -ENOENT;
- /* Allow only CRYPTO_ALL for sampling. */
+ /* Allow only CRYPTO_ALL for sampling */
if (a->sample_period && a->config != PAI_CRYPTO_BASE)
return -EINVAL;
/* Get a page to store last counter values for sampling */
@@ -284,13 +302,17 @@ static int paicrypt_event_init(struct perf_event *event)
}
}
- cpump = paicrypt_busy(event);
- if (IS_ERR(cpump)) {
+ if (event->cpu >= 0) {
+ cpump = paicrypt_busy(event, event->cpu);
+ if (IS_ERR(cpump))
+ rc = PTR_ERR(cpump);
+ } else {
+ rc = paicrypt_event_init_all(event);
+ }
+ if (rc) {
free_page(PAI_SAVE_AREA(event));
- rc = PTR_ERR(cpump);
goto out;
}
-
event->destroy = paicrypt_event_destroy;
if (a->sample_period) {
@@ -331,8 +353,14 @@ static void paicrypt_start(struct perf_event *event, int flags)
sum = paicrypt_getall(event); /* Get current value */
local64_set(&event->hw.prev_count, sum);
} else { /* Sampling */
- cpump->event = event;
- perf_sched_cb_inc(event->pmu);
+ memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE);
+ /* Enable context switch callback for system-wide sampling */
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
+ list_add_tail(PAI_SWLIST(event), &cpump->syswide_list);
+ perf_sched_cb_inc(event->pmu);
+ } else {
+ cpump->event = event;
+ }
}
}
@@ -344,7 +372,7 @@ static int paicrypt_add(struct perf_event *event, int flags)
if (++cpump->active_events == 1) {
ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
- WRITE_ONCE(S390_lowcore.ccd, ccd);
+ WRITE_ONCE(get_lowcore()->ccd, ccd);
local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
}
if (flags & PERF_EF_START)
@@ -353,6 +381,7 @@ static int paicrypt_add(struct perf_event *event, int flags)
return 0;
}
+static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *);
static void paicrypt_stop(struct perf_event *event, int flags)
{
struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
@@ -361,8 +390,13 @@ static void paicrypt_stop(struct perf_event *event, int flags)
if (!event->attr.sample_period) { /* Counting */
paicrypt_read(event);
} else { /* Sampling */
- perf_sched_cb_dec(event->pmu);
- cpump->event = NULL;
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
+ perf_sched_cb_dec(event->pmu);
+ list_del(PAI_SWLIST(event));
+ } else {
+ paicrypt_have_sample(event, cpump);
+ cpump->event = NULL;
+ }
}
event->hw.state = PERF_HES_STOPPED;
}
@@ -375,7 +409,7 @@ static void paicrypt_del(struct perf_event *event, int flags)
paicrypt_stop(event, PERF_EF_UPDATE);
if (--cpump->active_events == 0) {
local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
- WRITE_ONCE(S390_lowcore.ccd, 0);
+ WRITE_ONCE(get_lowcore()->ccd, 0);
}
}
@@ -455,23 +489,30 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
}
/* Check if there is data to be saved on schedule out of a task. */
-static int paicrypt_have_sample(void)
+static void paicrypt_have_sample(struct perf_event *event,
+ struct paicrypt_map *cpump)
{
- struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
- struct paicrypt_map *cpump = mp->mapptr;
- struct perf_event *event = cpump->event;
size_t rawsize;
- int rc = 0;
if (!event) /* No event active */
- return 0;
+ return;
rawsize = paicrypt_copy(cpump->save, cpump->page,
(unsigned long *)PAI_SAVE_AREA(event),
- cpump->event->attr.exclude_user,
- cpump->event->attr.exclude_kernel);
+ event->attr.exclude_user,
+ event->attr.exclude_kernel);
if (rawsize) /* No incremented counters */
- rc = paicrypt_push_sample(rawsize, cpump, event);
- return rc;
+ paicrypt_push_sample(rawsize, cpump, event);
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static void paicrypt_have_samples(void)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ struct perf_event *event;
+
+ list_for_each_entry(event, &cpump->syswide_list, hw.tp_list)
+ paicrypt_have_sample(event, cpump);
}
/* Called on schedule-in and schedule-out. No access to event structure,
@@ -480,10 +521,10 @@ static int paicrypt_have_sample(void)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
- * results on schedule_out and if page was dirty, clear values.
+ * results on schedule_out and if page was dirty, save old values.
*/
if (!sched_in)
- paicrypt_have_sample();
+ paicrypt_have_samples();
}
/* Attribute definitions for paicrypt interface. As with other CPU
@@ -527,7 +568,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
/* Performance monitoring unit for mapped counters */
static struct pmu paicrypt = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = paicrypt_event_init,
.add = paicrypt_add,
.del = paicrypt_del,
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index a6da7e0cc7a6..6295531b39a2 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -47,11 +47,11 @@ struct paiext_cb { /* PAI extension 1 control block */
struct paiext_map {
unsigned long *area; /* Area for CPU to store counters */
struct pai_userdata *save; /* Area to store non-zero counters */
- enum paievt_mode mode; /* Type of event */
unsigned int active_events; /* # of PAI Extension users */
refcount_t refcnt;
struct perf_event *event; /* Perf event for sampling */
struct paiext_cb *paiext_cb; /* PAI extension control block area */
+ struct list_head syswide_list; /* List system-wide sampling events */
};
struct paiext_mapptr {
@@ -70,6 +70,8 @@ static void paiext_root_free(void)
free_percpu(paiext_root.mapptr);
paiext_root.mapptr = NULL;
}
+ debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__,
+ refcount_read(&paiext_root.refcnt));
}
/* On initialization of first event also allocate per CPU data dynamically.
@@ -115,20 +117,34 @@ static void paiext_free(struct paiext_mapptr *mp)
}
/* Release the PMU if event is the last perf event */
-static void paiext_event_destroy(struct perf_event *event)
+static void paiext_event_destroy_cpu(struct perf_event *event, int cpu)
{
- struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu);
struct paiext_map *cpump = mp->mapptr;
- free_page(PAI_SAVE_AREA(event));
mutex_lock(&paiext_reserve_mutex);
if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
paiext_free(mp);
paiext_root_free();
mutex_unlock(&paiext_reserve_mutex);
- debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
- event->cpu, mp->mapptr);
+}
+
+static void paiext_event_destroy(struct perf_event *event)
+{
+ int cpu;
+
+ free_page(PAI_SAVE_AREA(event));
+ if (event->cpu == -1) {
+ struct cpumask *mask = PAI_CPU_MASK(event);
+ for_each_cpu(cpu, mask)
+ paiext_event_destroy_cpu(event, cpu);
+ kfree(mask);
+ } else {
+ paiext_event_destroy_cpu(event, event->cpu);
+ }
+ debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__,
+ event->cpu);
}
/* Used to avoid races in checking concurrent access of counting and
@@ -145,19 +161,18 @@ static void paiext_event_destroy(struct perf_event *event)
*
* Allocate the memory for the event.
*/
-static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+static int paiext_alloc_cpu(struct perf_event *event, int cpu)
{
struct paiext_mapptr *mp;
struct paiext_map *cpump;
int rc;
mutex_lock(&paiext_reserve_mutex);
-
rc = paiext_root_alloc();
if (rc)
goto unlock;
- mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ mp = per_cpu_ptr(paiext_root.mapptr, cpu);
cpump = mp->mapptr;
if (!cpump) { /* Paiext_map allocated? */
rc = -ENOMEM;
@@ -185,24 +200,13 @@ static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
paiext_free(mp);
goto undo;
}
+ INIT_LIST_HEAD(&cpump->syswide_list);
refcount_set(&cpump->refcnt, 1);
- cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
- : PAI_MODE_COUNTING;
+ rc = 0;
} else {
- /* Multiple invocation, check what is active.
- * Supported are multiple counter events or only one sampling
- * event concurrently at any one time.
- */
- if (cpump->mode == PAI_MODE_SAMPLING ||
- (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
- rc = -EBUSY;
- goto undo;
- }
refcount_inc(&cpump->refcnt);
}
- rc = 0;
-
undo:
if (rc) {
/* Error in allocation of event, decrement anchor. Since
@@ -217,6 +221,38 @@ unlock:
return rc;
}
+static int paiext_alloc(struct perf_event *event)
+{
+ struct cpumask *maskptr;
+ int cpu, rc = -ENOMEM;
+
+ maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL);
+ if (!maskptr)
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ rc = paiext_alloc_cpu(event, cpu);
+ if (rc) {
+ for_each_cpu(cpu, maskptr)
+ paiext_event_destroy_cpu(event, cpu);
+ kfree(maskptr);
+ goto out;
+ }
+ cpumask_set_cpu(cpu, maskptr);
+ }
+
+ /*
+ * On error all cpumask are freed and all events have been destroyed.
+ * Save of which CPUs data structures have been allocated for.
+ * Release them in paicrypt_event_destroy call back function
+ * for this event.
+ */
+ PAI_CPU_MASK(event) = maskptr;
+ rc = 0;
+out:
+ return rc;
+}
+
/* The PAI extension 1 control block supports up to 128 entries. Return
* the index within PAIE1_CB given the event number. Also validate event
* number.
@@ -246,9 +282,6 @@ static int paiext_event_init(struct perf_event *event)
rc = paiext_event_valid(event);
if (rc)
return rc;
- /* Allow only CPU wide operation, no process context for now. */
- if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
- return -ENOENT;
/* Allow only event NNPA_ALL for sampling. */
if (a->sample_period && a->config != PAI_NNPA_BASE)
return -EINVAL;
@@ -262,7 +295,10 @@ static int paiext_event_init(struct perf_event *event)
return -ENOMEM;
}
- rc = paiext_alloc(a, event);
+ if (event->cpu >= 0)
+ rc = paiext_alloc_cpu(event, event->cpu);
+ else
+ rc = paiext_alloc(event);
if (rc) {
free_page(PAI_SAVE_AREA(event));
return rc;
@@ -334,8 +370,15 @@ static void paiext_start(struct perf_event *event, int flags)
sum = paiext_getall(event); /* Get current value */
local64_set(&event->hw.prev_count, sum);
} else { /* Sampling */
- cpump->event = event;
- perf_sched_cb_inc(event->pmu);
+ memcpy((void *)PAI_SAVE_AREA(event), cpump->area,
+ PAIE1_CTRBLOCK_SZ);
+ /* Enable context switch callback for system-wide sampling */
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
+ list_add_tail(PAI_SWLIST(event), &cpump->syswide_list);
+ perf_sched_cb_inc(event->pmu);
+ } else {
+ cpump->event = event;
+ }
}
}
@@ -346,12 +389,10 @@ static int paiext_add(struct perf_event *event, int flags)
struct paiext_cb *pcb = cpump->paiext_cb;
if (++cpump->active_events == 1) {
- S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+ get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb);
pcb->acc = virt_to_phys(cpump->area) | 0x1;
/* Enable CPU instruction lookup for PAIE1 control block */
local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
- debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
- __func__, S390_lowcore.aicd, pcb->acc);
}
if (flags & PERF_EF_START)
paiext_start(event, PERF_EF_RELOAD);
@@ -359,6 +400,7 @@ static int paiext_add(struct perf_event *event, int flags)
return 0;
}
+static void paiext_have_sample(struct perf_event *, struct paiext_map *);
static void paiext_stop(struct perf_event *event, int flags)
{
struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
@@ -367,8 +409,13 @@ static void paiext_stop(struct perf_event *event, int flags)
if (!event->attr.sample_period) { /* Counting */
paiext_read(event);
} else { /* Sampling */
- perf_sched_cb_dec(event->pmu);
- cpump->event = NULL;
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
+ list_del(PAI_SWLIST(event));
+ perf_sched_cb_dec(event->pmu);
+ } else {
+ paiext_have_sample(event, cpump);
+ cpump->event = NULL;
+ }
}
event->hw.state = PERF_HES_STOPPED;
}
@@ -384,9 +431,7 @@ static void paiext_del(struct perf_event *event, int flags)
/* Disable CPU instruction lookup for PAIE1 control block */
local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
pcb->acc = 0;
- S390_lowcore.aicd = 0;
- debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
- __func__, S390_lowcore.aicd, pcb->acc);
+ get_lowcore()->aicd = 0;
}
}
@@ -470,21 +515,28 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
}
/* Check if there is data to be saved on schedule out of a task. */
-static int paiext_have_sample(void)
+static void paiext_have_sample(struct perf_event *event,
+ struct paiext_map *cpump)
{
- struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
- struct paiext_map *cpump = mp->mapptr;
- struct perf_event *event = cpump->event;
size_t rawsize;
- int rc = 0;
if (!event)
- return 0;
+ return;
rawsize = paiext_copy(cpump->save, cpump->area,
(unsigned long *)PAI_SAVE_AREA(event));
if (rawsize) /* Incremented counters */
- rc = paiext_push_sample(rawsize, cpump, event);
- return rc;
+ paiext_push_sample(rawsize, cpump, event);
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static void paiext_have_samples(void)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct perf_event *event;
+
+ list_for_each_entry(event, &cpump->syswide_list, hw.tp_list)
+ paiext_have_sample(event, cpump);
}
/* Called on schedule-in and schedule-out. No access to event structure,
@@ -493,10 +545,10 @@ static int paiext_have_sample(void)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
- * results on schedule_out and if page was dirty, clear values.
+ * results on schedule_out and if page was dirty, save old values.
*/
if (!sched_in)
- paiext_have_sample();
+ paiext_have_samples();
}
/* Attribute definitions for pai extension1 interface. As with other CPU
@@ -542,7 +594,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
/* Performance monitoring unit for mapped counters */
static struct pmu paiext = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = paiext_event_init,
.add = paiext_add,
.del = paiext_del,
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index d8740631df4b..9637aee43c40 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -71,10 +71,10 @@ void flush_thread(void)
void arch_setup_new_exec(void)
{
- if (S390_lowcore.current_pid != current->pid) {
- S390_lowcore.current_pid = current->pid;
+ if (get_lowcore()->current_pid != current->pid) {
+ get_lowcore()->current_pid = current->pid;
if (test_facility(40))
- lpp(&S390_lowcore.lpp);
+ lpp(&get_lowcore()->lpp);
}
}
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 90c2c786bb35..3993f4caf224 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -421,16 +421,16 @@ static void __init setup_lowcore(void)
lc->clock_comparator = clock_comparator_max;
lc->current_task = (unsigned long)&init_task;
lc->lpp = LPP_MAGIC;
- lc->machine_flags = S390_lowcore.machine_flags;
- lc->preempt_count = S390_lowcore.preempt_count;
+ lc->machine_flags = get_lowcore()->machine_flags;
+ lc->preempt_count = get_lowcore()->preempt_count;
nmi_alloc_mcesa_early(&lc->mcesad);
- lc->sys_enter_timer = S390_lowcore.sys_enter_timer;
- lc->exit_timer = S390_lowcore.exit_timer;
- lc->user_timer = S390_lowcore.user_timer;
- lc->system_timer = S390_lowcore.system_timer;
- lc->steal_timer = S390_lowcore.steal_timer;
- lc->last_update_timer = S390_lowcore.last_update_timer;
- lc->last_update_clock = S390_lowcore.last_update_clock;
+ lc->sys_enter_timer = get_lowcore()->sys_enter_timer;
+ lc->exit_timer = get_lowcore()->exit_timer;
+ lc->user_timer = get_lowcore()->user_timer;
+ lc->system_timer = get_lowcore()->system_timer;
+ lc->steal_timer = get_lowcore()->steal_timer;
+ lc->last_update_timer = get_lowcore()->last_update_timer;
+ lc->last_update_clock = get_lowcore()->last_update_clock;
/*
* Allocate the global restart stack which is the same for
* all CPUs in case *one* of them does a PSW restart.
@@ -439,7 +439,7 @@ static void __init setup_lowcore(void)
lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET;
lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET;
lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET;
- lc->kernel_stack = S390_lowcore.kernel_stack;
+ lc->kernel_stack = get_lowcore()->kernel_stack;
/*
* Set up PSW restart to call ipl.c:do_restart(). Copy the relevant
* restart data to the absolute zero lowcore. This is necessary if
@@ -455,8 +455,8 @@ static void __init setup_lowcore(void)
lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
lc->preempt_count = PREEMPT_DISABLED;
- lc->kernel_asce = S390_lowcore.kernel_asce;
- lc->user_asce = S390_lowcore.user_asce;
+ lc->kernel_asce = get_lowcore()->kernel_asce;
+ lc->user_asce = get_lowcore()->user_asce;
system_ctlreg_init_save_area(lc);
abs_lc = get_abs_lowcore();
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 0324649aae0a..c3c54adf67bc 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -74,8 +74,6 @@ enum {
CPU_STATE_CONFIGURED,
};
-static DEFINE_PER_CPU(struct cpu *, cpu_device);
-
struct pcpu {
unsigned long ec_mask; /* bit mask for ec_xxx functions */
unsigned long ec_clk; /* sigp timestamp for ec_xxx */
@@ -203,7 +201,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
mcck_stack = stack_alloc();
if (!lc || !nodat_stack || !async_stack || !mcck_stack)
goto out;
- memcpy(lc, &S390_lowcore, 512);
+ memcpy(lc, get_lowcore(), 512);
memset((char *) lc + 512, 0, sizeof(*lc) - 512);
lc->async_stack = async_stack + STACK_INIT_OFFSET;
lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET;
@@ -265,9 +263,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
lc->percpu_offset = __per_cpu_offset[cpu];
- lc->kernel_asce = S390_lowcore.kernel_asce;
+ lc->kernel_asce = get_lowcore()->kernel_asce;
lc->user_asce = s390_invalid_asce;
- lc->machine_flags = S390_lowcore.machine_flags;
+ lc->machine_flags = get_lowcore()->machine_flags;
lc->user_timer = lc->system_timer =
lc->steal_timer = lc->avg_steal_timer = 0;
abs_lc = get_abs_lowcore();
@@ -407,7 +405,7 @@ void smp_call_ipl_cpu(void (*func)(void *), void *data)
struct lowcore *lc = lowcore_ptr[0];
if (pcpu_devices[0].address == stap())
- lc = &S390_lowcore;
+ lc = get_lowcore();
pcpu_delegate(&pcpu_devices[0], func, data,
lc->nodat_stack);
@@ -719,8 +717,6 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
}
}
-static int smp_add_present_cpu(int cpu);
-
static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
bool configured, bool early)
{
@@ -744,7 +740,7 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
pcpu->state = CPU_STATE_STANDBY;
smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN);
set_cpu_present(cpu, true);
- if (!early && smp_add_present_cpu(cpu) != 0)
+ if (!early && arch_register_cpu(cpu))
set_cpu_present(cpu, false);
else
nr++;
@@ -831,9 +827,6 @@ void __init smp_detect_cpus(void)
s_cpus += smp_cpu_mtid + 1;
}
pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
-
- /* Add CPUs present at boot */
- __smp_rescan_cpus(info, true);
memblock_free(info, sizeof(*info));
}
@@ -842,15 +835,16 @@ void __init smp_detect_cpus(void)
*/
static void smp_start_secondary(void *cpuvoid)
{
+ struct lowcore *lc = get_lowcore();
int cpu = raw_smp_processor_id();
- S390_lowcore.last_update_clock = get_tod_clock();
- S390_lowcore.restart_stack = (unsigned long)restart_stack;
- S390_lowcore.restart_fn = (unsigned long)do_restart;
- S390_lowcore.restart_data = 0;
- S390_lowcore.restart_source = -1U;
- S390_lowcore.restart_flags = 0;
- restore_access_regs(S390_lowcore.access_regs_save_area);
+ lc->last_update_clock = get_tod_clock();
+ lc->restart_stack = (unsigned long)restart_stack;
+ lc->restart_fn = (unsigned long)do_restart;
+ lc->restart_data = 0;
+ lc->restart_source = -1U;
+ lc->restart_flags = 0;
+ restore_access_regs(lc->access_regs_save_area);
cpu_init();
rcutree_report_cpu_starting(cpu);
init_cpu_timer();
@@ -973,6 +967,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
panic("Couldn't request external interrupt 0x1202");
system_ctl_set_bit(0, 13);
+ smp_rescan_cpus(true);
}
void __init smp_prepare_boot_cpu(void)
@@ -981,16 +976,18 @@ void __init smp_prepare_boot_cpu(void)
WARN_ON(!cpu_present(0) || !cpu_online(0));
pcpu->state = CPU_STATE_CONFIGURED;
- S390_lowcore.percpu_offset = __per_cpu_offset[0];
+ get_lowcore()->percpu_offset = __per_cpu_offset[0];
smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
}
void __init smp_setup_processor_id(void)
{
+ struct lowcore *lc = get_lowcore();
+
pcpu_devices[0].address = stap();
- S390_lowcore.cpu_nr = 0;
- S390_lowcore.spinlock_lockval = arch_spin_lockval(0);
- S390_lowcore.spinlock_index = 0;
+ lc->cpu_nr = 0;
+ lc->spinlock_lockval = arch_spin_lockval(0);
+ lc->spinlock_index = 0;
}
/*
@@ -1108,35 +1105,34 @@ static struct attribute_group cpu_online_attr_group = {
static int smp_cpu_online(unsigned int cpu)
{
- struct device *s = &per_cpu(cpu_device, cpu)->dev;
+ struct cpu *c = &per_cpu(cpu_devices, cpu);
- return sysfs_create_group(&s->kobj, &cpu_online_attr_group);
+ return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group);
}
static int smp_cpu_pre_down(unsigned int cpu)
{
- struct device *s = &per_cpu(cpu_device, cpu)->dev;
+ struct cpu *c = &per_cpu(cpu_devices, cpu);
- sysfs_remove_group(&s->kobj, &cpu_online_attr_group);
+ sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group);
return 0;
}
-static int smp_add_present_cpu(int cpu)
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return !!cpu;
+}
+
+int arch_register_cpu(int cpu)
{
- struct device *s;
- struct cpu *c;
+ struct cpu *c = &per_cpu(cpu_devices, cpu);
int rc;
- c = kzalloc(sizeof(*c), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
- per_cpu(cpu_device, cpu) = c;
- s = &c->dev;
- c->hotpluggable = !!cpu;
+ c->hotpluggable = arch_cpu_is_hotpluggable(cpu);
rc = register_cpu(c, cpu);
if (rc)
goto out;
- rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group);
+ rc = sysfs_create_group(&c->dev.kobj, &cpu_common_attr_group);
if (rc)
goto out_cpu;
rc = topology_cpu_init(c);
@@ -1145,14 +1141,14 @@ static int smp_add_present_cpu(int cpu)
return 0;
out_topology:
- sysfs_remove_group(&s->kobj, &cpu_common_attr_group);
+ sysfs_remove_group(&c->dev.kobj, &cpu_common_attr_group);
out_cpu:
unregister_cpu(c);
out:
return rc;
}
-int __ref smp_rescan_cpus(void)
+int __ref smp_rescan_cpus(bool early)
{
struct sclp_core_info *info;
int nr;
@@ -1161,7 +1157,7 @@ int __ref smp_rescan_cpus(void)
if (!info)
return -ENOMEM;
smp_get_core_info(info, 0);
- nr = __smp_rescan_cpus(info, false);
+ nr = __smp_rescan_cpus(info, early);
kfree(info);
if (nr)
topology_schedule_update();
@@ -1178,7 +1174,7 @@ static ssize_t __ref rescan_store(struct device *dev,
rc = lock_device_hotplug_sysfs();
if (rc)
return rc;
- rc = smp_rescan_cpus();
+ rc = smp_rescan_cpus(false);
unlock_device_hotplug();
return rc ? rc : count;
}
@@ -1187,7 +1183,7 @@ static DEVICE_ATTR_WO(rescan);
static int __init s390_smp_init(void)
{
struct device *dev_root;
- int cpu, rc = 0;
+ int rc;
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
@@ -1196,17 +1192,9 @@ static int __init s390_smp_init(void)
if (rc)
return rc;
}
-
- for_each_present_cpu(cpu) {
- rc = smp_add_present_cpu(cpu);
- if (rc)
- goto out;
- }
-
rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online",
smp_cpu_online, smp_cpu_pre_down);
rc = rc <= 0 ? rc : 0;
-out:
return rc;
}
subsys_initcall(s390_smp_init);
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 30bb20461db4..1cf2ad04f8e9 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -300,33 +300,56 @@ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
return (struct diag204_x_part_block *)&block->cpus[i];
}
-static void fill_diag(struct sthyi_sctns *sctns)
+static void *diag204_get_data(bool diag204_allow_busy)
{
- int i, r, pages;
- bool this_lpar;
+ unsigned long subcode;
void *diag204_buf;
- void *diag224_buf = NULL;
- struct diag204_x_info_blk_hdr *ti_hdr;
- struct diag204_x_part_block *part_block;
- struct diag204_x_phys_block *phys_block;
- struct lpar_cpu_inf lpar_inf = {};
-
- /* Errors are handled through the validity bits in the response. */
- pages = diag204((unsigned long)DIAG204_SUBC_RSI |
- (unsigned long)DIAG204_INFO_EXT, 0, NULL);
- if (pages <= 0)
- return;
-
+ int pages, rc;
+
+ subcode = DIAG204_SUBC_RSI;
+ subcode |= DIAG204_INFO_EXT;
+ pages = diag204(subcode, 0, NULL);
+ if (pages < 0)
+ return ERR_PTR(pages);
+ if (pages == 0)
+ return ERR_PTR(-ENODATA);
diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
__builtin_return_address(0));
if (!diag204_buf)
- return;
+ return ERR_PTR(-ENOMEM);
+ subcode = DIAG204_SUBC_STIB7;
+ subcode |= DIAG204_INFO_EXT;
+ if (diag204_has_bif() && diag204_allow_busy)
+ subcode |= DIAG204_BIF_BIT;
+ rc = diag204(subcode, pages, diag204_buf);
+ if (rc < 0) {
+ vfree(diag204_buf);
+ return ERR_PTR(rc);
+ }
+ return diag204_buf;
+}
- r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
- (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
- if (r < 0)
- goto out;
+static bool is_diag204_cached(struct sthyi_sctns *sctns)
+{
+ /*
+ * Check if validity bits are set when diag204 data
+ * is gathered.
+ */
+ if (sctns->par.infpval1)
+ return true;
+ return false;
+}
+
+static void fill_diag(struct sthyi_sctns *sctns, void *diag204_buf)
+{
+ int i;
+ bool this_lpar;
+ void *diag224_buf = NULL;
+ struct diag204_x_info_blk_hdr *ti_hdr;
+ struct diag204_x_part_block *part_block;
+ struct diag204_x_phys_block *phys_block;
+ struct lpar_cpu_inf lpar_inf = {};
diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
if (!diag224_buf || diag224(diag224_buf))
@@ -392,7 +415,6 @@ static void fill_diag(struct sthyi_sctns *sctns)
out:
free_page((unsigned long)diag224_buf);
- vfree(diag204_buf);
}
static int sthyi(u64 vaddr, u64 *rc)
@@ -414,19 +436,31 @@ static int sthyi(u64 vaddr, u64 *rc)
static int fill_dst(void *dst, u64 *rc)
{
+ void *diag204_buf;
+
struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst;
/*
* If the facility is on, we don't want to emulate the instruction.
* We ask the hypervisor to provide the data.
*/
- if (test_facility(74))
+ if (test_facility(74)) {
+ memset(dst, 0, PAGE_SIZE);
return sthyi((u64)dst, rc);
-
+ }
+ /*
+ * When emulating, if diag204 returns BUSY don't reset dst buffer
+ * and use cached data.
+ */
+ *rc = 0;
+ diag204_buf = diag204_get_data(is_diag204_cached(sctns));
+ if (IS_ERR(diag204_buf))
+ return PTR_ERR(diag204_buf);
+ memset(dst, 0, PAGE_SIZE);
fill_hdr(sctns);
fill_stsi(sctns);
- fill_diag(sctns);
- *rc = 0;
+ fill_diag(sctns, diag204_buf);
+ vfree(diag204_buf);
return 0;
}
@@ -445,11 +479,14 @@ static int sthyi_update_cache(u64 *rc)
{
int r;
- memset(sthyi_cache.info, 0, PAGE_SIZE);
r = fill_dst(sthyi_cache.info, rc);
- if (r)
- return r;
- sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES;
+ if (r == 0) {
+ sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES;
+ } else if (r == -EBUSY) {
+ /* mark as expired and return 0 to keep using cached data */
+ sthyi_cache.end = jiffies - 1;
+ r = 0;
+ }
return r;
}
diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c
index 50cbcbbaa03d..5ec28028315b 100644
--- a/arch/s390/kernel/syscall.c
+++ b/arch/s390/kernel/syscall.c
@@ -124,8 +124,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
{
add_random_kstack_offset();
enter_from_user_mode(regs);
- regs->psw = S390_lowcore.svc_old_psw;
- regs->int_code = S390_lowcore.svc_int_code;
+ regs->psw = get_lowcore()->svc_old_psw;
+ regs->int_code = get_lowcore()->svc_int_code;
update_timer_sys();
if (static_branch_likely(&cpu_has_bear))
current->thread.last_break = regs->last_break;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index fb9f31f36628..b713effe0579 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -131,7 +131,7 @@ void clock_comparator_work(void)
{
struct clock_event_device *cd;
- S390_lowcore.clock_comparator = clock_comparator_max;
+ get_lowcore()->clock_comparator = clock_comparator_max;
cd = this_cpu_ptr(&comparators);
cd->event_handler(cd);
}
@@ -139,8 +139,8 @@ void clock_comparator_work(void)
static int s390_next_event(unsigned long delta,
struct clock_event_device *evt)
{
- S390_lowcore.clock_comparator = get_tod_clock() + delta;
- set_clock_comparator(S390_lowcore.clock_comparator);
+ get_lowcore()->clock_comparator = get_tod_clock() + delta;
+ set_clock_comparator(get_lowcore()->clock_comparator);
return 0;
}
@@ -153,8 +153,8 @@ void init_cpu_timer(void)
struct clock_event_device *cd;
int cpu;
- S390_lowcore.clock_comparator = clock_comparator_max;
- set_clock_comparator(S390_lowcore.clock_comparator);
+ get_lowcore()->clock_comparator = clock_comparator_max;
+ set_clock_comparator(get_lowcore()->clock_comparator);
cpu = smp_processor_id();
cd = &per_cpu(comparators, cpu);
@@ -184,8 +184,8 @@ static void clock_comparator_interrupt(struct ext_code ext_code,
unsigned long param64)
{
inc_irq_stat(IRQEXT_CLK);
- if (S390_lowcore.clock_comparator == clock_comparator_max)
- set_clock_comparator(S390_lowcore.clock_comparator);
+ if (get_lowcore()->clock_comparator == clock_comparator_max)
+ set_clock_comparator(get_lowcore()->clock_comparator);
}
static void stp_timing_alert(struct stp_irq_parm *);
@@ -408,12 +408,12 @@ static void clock_sync_global(long delta)
static void clock_sync_local(long delta)
{
/* Add the delta to the clock comparator. */
- if (S390_lowcore.clock_comparator != clock_comparator_max) {
- S390_lowcore.clock_comparator += delta;
- set_clock_comparator(S390_lowcore.clock_comparator);
+ if (get_lowcore()->clock_comparator != clock_comparator_max) {
+ get_lowcore()->clock_comparator += delta;
+ set_clock_comparator(get_lowcore()->clock_comparator);
}
/* Adjust the last_update_clock time-stamp. */
- S390_lowcore.last_update_clock += delta;
+ get_lowcore()->last_update_clock += delta;
}
/* Single threaded workqueue used for stp sync events */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 89e91b8ce842..98ef6dc7916b 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -320,16 +320,10 @@ static int __arch_update_cpu_topology(void)
int arch_update_cpu_topology(void)
{
- struct device *dev;
- int cpu, rc;
+ int rc;
rc = __arch_update_cpu_topology();
on_each_cpu(__arch_update_dedicated_flag, NULL, 0);
- for_each_online_cpu(cpu) {
- dev = get_cpu_device(cpu);
- if (dev)
- kobject_uevent(&dev->kobj, KOBJ_CHANGE);
- }
return rc;
}
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 52578b5cecbd..a7c211a3a0c9 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -288,15 +288,16 @@ static void __init test_monitor_call(void)
void __init trap_init(void)
{
+ struct lowcore *lc = get_lowcore();
unsigned long flags;
struct ctlreg cr0;
local_irq_save(flags);
cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
- psw_bits(S390_lowcore.external_new_psw).mcheck = 1;
- psw_bits(S390_lowcore.program_new_psw).mcheck = 1;
- psw_bits(S390_lowcore.svc_new_psw).mcheck = 1;
- psw_bits(S390_lowcore.io_new_psw).mcheck = 1;
+ psw_bits(lc->external_new_psw).mcheck = 1;
+ psw_bits(lc->program_new_psw).mcheck = 1;
+ psw_bits(lc->svc_new_psw).mcheck = 1;
+ psw_bits(lc->io_new_psw).mcheck = 1;
local_ctl_load(0, &cr0);
local_irq_restore(flags);
local_mcck_enable();
@@ -307,11 +308,12 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
void noinstr __do_pgm_check(struct pt_regs *regs)
{
- unsigned int trapnr;
+ struct lowcore *lc = get_lowcore();
irqentry_state_t state;
+ unsigned int trapnr;
- regs->int_code = S390_lowcore.pgm_int_code;
- regs->int_parm_long = S390_lowcore.trans_exc_code;
+ regs->int_code = lc->pgm_int_code;
+ regs->int_parm_long = lc->trans_exc_code;
state = irqentry_enter(regs);
@@ -324,19 +326,19 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
current->thread.last_break = regs->last_break;
}
- if (S390_lowcore.pgm_code & 0x0200) {
+ if (lc->pgm_code & 0x0200) {
/* transaction abort */
- current->thread.trap_tdb = S390_lowcore.pgm_tdb;
+ current->thread.trap_tdb = lc->pgm_tdb;
}
- if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) {
+ if (lc->pgm_code & PGM_INT_CODE_PER) {
if (user_mode(regs)) {
struct per_event *ev = &current->thread.per_event;
set_thread_flag(TIF_PER_TRAP);
- ev->address = S390_lowcore.per_address;
- ev->cause = S390_lowcore.per_code_combined;
- ev->paid = S390_lowcore.per_access_id;
+ ev->address = lc->per_address;
+ ev->cause = lc->per_code_combined;
+ ev->paid = lc->per_access_id;
} else {
/* PER event in kernel is kprobes */
__arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 265fea37e030..fa62fa0e369f 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(uv_pin_shared);
*
* @paddr: Absolute host address of page to be destroyed
*/
-static int uv_destroy_page(unsigned long paddr)
+static int uv_destroy(unsigned long paddr)
{
struct uv_cb_cfs uvcb = {
.header.cmd = UVC_CMD_DESTR_SEC_STOR,
@@ -131,28 +131,40 @@ static int uv_destroy_page(unsigned long paddr)
}
/*
- * The caller must already hold a reference to the page
+ * The caller must already hold a reference to the folio
*/
-int uv_destroy_owned_page(unsigned long paddr)
+int uv_destroy_folio(struct folio *folio)
{
- struct page *page = phys_to_page(paddr);
int rc;
- get_page(page);
- rc = uv_destroy_page(paddr);
+ /* See gmap_make_secure(): large folios cannot be secure */
+ if (unlikely(folio_test_large(folio)))
+ return 0;
+
+ folio_get(folio);
+ rc = uv_destroy(folio_to_phys(folio));
if (!rc)
- clear_bit(PG_arch_1, &page->flags);
- put_page(page);
+ clear_bit(PG_arch_1, &folio->flags);
+ folio_put(folio);
return rc;
}
/*
+ * The present PTE still indirectly holds a folio reference through the mapping.
+ */
+int uv_destroy_pte(pte_t pte)
+{
+ VM_WARN_ON(!pte_present(pte));
+ return uv_destroy_folio(pfn_folio(pte_pfn(pte)));
+}
+
+/*
* Requests the Ultravisor to encrypt a guest page and make it
* accessible to the host for paging (export).
*
* @paddr: Absolute host address of page to be exported
*/
-int uv_convert_from_secure(unsigned long paddr)
+static int uv_convert_from_secure(unsigned long paddr)
{
struct uv_cb_cfs uvcb = {
.header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
@@ -166,22 +178,34 @@ int uv_convert_from_secure(unsigned long paddr)
}
/*
- * The caller must already hold a reference to the page
+ * The caller must already hold a reference to the folio.
*/
-int uv_convert_owned_from_secure(unsigned long paddr)
+static int uv_convert_from_secure_folio(struct folio *folio)
{
- struct page *page = phys_to_page(paddr);
int rc;
- get_page(page);
- rc = uv_convert_from_secure(paddr);
+ /* See gmap_make_secure(): large folios cannot be secure */
+ if (unlikely(folio_test_large(folio)))
+ return 0;
+
+ folio_get(folio);
+ rc = uv_convert_from_secure(folio_to_phys(folio));
if (!rc)
- clear_bit(PG_arch_1, &page->flags);
- put_page(page);
+ clear_bit(PG_arch_1, &folio->flags);
+ folio_put(folio);
return rc;
}
/*
+ * The present PTE still indirectly holds a folio reference through the mapping.
+ */
+int uv_convert_from_secure_pte(pte_t pte)
+{
+ VM_WARN_ON(!pte_present(pte));
+ return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte)));
+}
+
+/*
* Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
* the kernel, but with some slight modifications. We know that a secure
@@ -267,6 +291,36 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
}
/*
+ * Drain LRU caches: the local one on first invocation and the ones of all
+ * CPUs on successive invocations. Returns "true" on the first invocation.
+ */
+static bool drain_lru(bool *drain_lru_called)
+{
+ /*
+ * If we have tried a local drain and the folio refcount
+ * still does not match our expected safe value, try with a
+ * system wide drain. This is needed if the pagevecs holding
+ * the page are on a different CPU.
+ */
+ if (*drain_lru_called) {
+ lru_add_drain_all();
+ /* We give up here, don't retry immediately. */
+ return false;
+ }
+ /*
+ * We are here if the folio refcount does not match the
+ * expected safe value. The main culprits are usually
+ * pagevecs. With lru_add_drain() we drain the pagevecs
+ * on the local CPU so that hopefully the refcount will
+ * reach the expected safe value.
+ */
+ lru_add_drain();
+ *drain_lru_called = true;
+ /* The caller should try again immediately */
+ return true;
+}
+
+/*
* Requests the Ultravisor to make a page accessible to a guest.
* If it's brought in the first time, it will be cleared. If
* it has been exported before, it will be decrypted and integrity
@@ -275,7 +329,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
{
struct vm_area_struct *vma;
- bool local_drain = false;
+ bool drain_lru_called = false;
spinlock_t *ptelock;
unsigned long uaddr;
struct folio *folio;
@@ -308,52 +362,63 @@ again:
goto out;
if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
folio = page_folio(pte_page(*ptep));
- rc = -EINVAL;
- if (folio_test_large(folio))
- goto unlock;
rc = -EAGAIN;
- if (folio_trylock(folio)) {
+ if (folio_test_large(folio)) {
+ rc = -E2BIG;
+ } else if (folio_trylock(folio)) {
if (should_export_before_import(uvcb, gmap->mm))
uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
rc = make_folio_secure(folio, uvcb);
folio_unlock(folio);
}
+
+ /*
+ * Once we drop the PTL, the folio may get unmapped and
+ * freed immediately. We need a temporary reference.
+ */
+ if (rc == -EAGAIN || rc == -E2BIG)
+ folio_get(folio);
}
-unlock:
pte_unmap_unlock(ptep, ptelock);
out:
mmap_read_unlock(gmap->mm);
- if (rc == -EAGAIN) {
+ switch (rc) {
+ case -E2BIG:
+ folio_lock(folio);
+ rc = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ switch (rc) {
+ case 0:
+ /* Splitting succeeded, try again immediately. */
+ goto again;
+ case -EAGAIN:
+ /* Additional folio references. */
+ if (drain_lru(&drain_lru_called))
+ goto again;
+ return -EAGAIN;
+ case -EBUSY:
+ /* Unexpected race. */
+ return -EAGAIN;
+ }
+ WARN_ON_ONCE(1);
+ return -ENXIO;
+ case -EAGAIN:
/*
* If we are here because the UVC returned busy or partial
* completion, this is just a useless check, but it is safe.
*/
folio_wait_writeback(folio);
- } else if (rc == -EBUSY) {
- /*
- * If we have tried a local drain and the folio refcount
- * still does not match our expected safe value, try with a
- * system wide drain. This is needed if the pagevecs holding
- * the page are on a different CPU.
- */
- if (local_drain) {
- lru_add_drain_all();
- /* We give up here, and let the caller try again */
- return -EAGAIN;
- }
- /*
- * We are here if the folio refcount does not match the
- * expected safe value. The main culprits are usually
- * pagevecs. With lru_add_drain() we drain the pagevecs
- * on the local CPU so that hopefully the refcount will
- * reach the expected safe value.
- */
- lru_add_drain();
- local_drain = true;
- /* And now we try again immediately after draining */
- goto again;
- } else if (rc == -ENXIO) {
+ folio_put(folio);
+ return -EAGAIN;
+ case -EBUSY:
+ /* Additional folio references. */
+ if (drain_lru(&drain_lru_called))
+ goto again;
+ return -EAGAIN;
+ case -ENXIO:
if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
return -EFAULT;
return -EAGAIN;
@@ -388,6 +453,7 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
{
struct vm_area_struct *vma;
unsigned long uaddr;
+ struct folio *folio;
struct page *page;
int rc;
@@ -411,7 +477,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- rc = uv_destroy_owned_page(page_to_phys(page));
+ folio = page_folio(page);
+ rc = uv_destroy_folio(folio);
/*
* Fault handlers can race; it is possible that two CPUs will fault
* on the same secure page. One CPU can destroy the page, reboot,
@@ -422,8 +489,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
* we instead try to export the page.
*/
if (rc)
- rc = uv_convert_owned_from_secure(page_to_phys(page));
- put_page(page);
+ rc = uv_convert_from_secure_folio(folio);
+ folio_put(folio);
out:
mmap_read_unlock(gmap->mm);
return rc;
@@ -431,47 +498,51 @@ out:
EXPORT_SYMBOL_GPL(gmap_destroy_page);
/*
- * To be called with the page locked or with an extra reference! This will
- * prevent gmap_make_secure from touching the page concurrently. Having 2
- * parallel make_page_accessible is fine, as the UV calls will become a
- * no-op if the page is already exported.
+ * To be called with the folio locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the folio concurrently. Having 2
+ * parallel arch_make_folio_accessible is fine, as the UV calls will become a
+ * no-op if the folio is already exported.
*/
-int arch_make_page_accessible(struct page *page)
+int arch_make_folio_accessible(struct folio *folio)
{
int rc = 0;
- /* Hugepage cannot be protected, so nothing to do */
- if (PageHuge(page))
+ /* See gmap_make_secure(): large folios cannot be secure */
+ if (unlikely(folio_test_large(folio)))
return 0;
/*
- * PG_arch_1 is used in 3 places:
- * 1. for kernel page tables during early boot
- * 2. for storage keys of huge pages and KVM
- * 3. As an indication that this page might be secure. This can
+ * PG_arch_1 is used in 2 places:
+ * 1. for storage keys of hugetlb folios and KVM
+ * 2. As an indication that this small folio might be secure. This can
* overindicate, e.g. we set the bit before calling
* convert_to_secure.
- * As secure pages are never huge, all 3 variants can co-exists.
+	 * As secure pages are never large folios, both variants can co-exist.
*/
- if (!test_bit(PG_arch_1, &page->flags))
+ if (!test_bit(PG_arch_1, &folio->flags))
return 0;
- rc = uv_pin_shared(page_to_phys(page));
+ rc = uv_pin_shared(folio_to_phys(folio));
if (!rc) {
- clear_bit(PG_arch_1, &page->flags);
+ clear_bit(PG_arch_1, &folio->flags);
return 0;
}
- rc = uv_convert_from_secure(page_to_phys(page));
+ rc = uv_convert_from_secure(folio_to_phys(folio));
if (!rc) {
- clear_bit(PG_arch_1, &page->flags);
+ clear_bit(PG_arch_1, &folio->flags);
return 0;
}
return rc;
}
-EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+EXPORT_SYMBOL_GPL(arch_make_folio_accessible);
+int arch_make_page_accessible(struct page *page)
+{
+ return arch_make_folio_accessible(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
#endif
#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
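
A compressed model of the retry policy that the reworked gmap_make_secure() above implements may help here: large folios are split (-E2BIG), unexpected folio references trigger an LRU drain that is local on the first pass and system-wide on the second (-EBUSY), and both cases are reported back as -EAGAIN once retries are exhausted. The sketch below is a userspace-style approximation with hypothetical stub names; it omits the writeback wait on -EAGAIN, the gmap_fault() path on -ENXIO and the temporary folio reference handling shown in the diff.

#include <errno.h>
#include <stdbool.h>

/* Trivial stand-ins for the kernel primitives; the real ones are
 * make_folio_secure(), split_folio(), lru_add_drain() and lru_add_drain_all(). */
static int try_make_secure(void) { return 0; }		/* 0, -E2BIG, -EBUSY or -EAGAIN */
static int split_large_folio(void) { return 0; }	/* 0, -EAGAIN or -EBUSY */
static void drain_local_lru(void) { }
static void drain_all_lru(void) { }

static bool drain_lru(bool *called)
{
	if (*called) {
		drain_all_lru();	/* second failure: drain every CPU, then stop retrying */
		return false;
	}
	drain_local_lru();		/* first failure: local CPU only */
	*called = true;
	return true;			/* caller may retry immediately */
}

static int make_secure(void)
{
	bool drain_lru_called = false;
	int rc;
again:
	rc = try_make_secure();
	switch (rc) {
	case -E2BIG:			/* large folio: split it, then retry */
		rc = split_large_folio();
		if (rc == 0)
			goto again;
		if (rc == -EAGAIN && drain_lru(&drain_lru_called))
			goto again;
		return -EAGAIN;		/* -EBUSY (lost race) also becomes -EAGAIN */
	case -EBUSY:			/* unexpected extra references: drain and retry */
		if (drain_lru(&drain_lru_called))
			goto again;
		return -EAGAIN;
	default:
		return rc;		/* 0, -EAGAIN, -ENXIO, ... left to the caller */
	}
}

The two-level drain keeps the common case cheap: lru_add_drain() only touches the local CPU's pagevecs, and the expensive lru_add_drain_all() is reserved for the second failure, after which the caller gets -EAGAIN instead of looping indefinitely.
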
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index ffc1db0cbf9c..234a0ba30510 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -35,14 +35,15 @@ static DEFINE_PER_CPU(u64, mt_scaling_jiffies);
static inline void set_vtimer(u64 expires)
{
+ struct lowcore *lc = get_lowcore();
u64 timer;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
" spt %1" /* Set new value imm. afterwards */
: "=Q" (timer) : "Q" (expires));
- S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer;
- S390_lowcore.last_update_timer = expires;
+ lc->system_timer += lc->last_update_timer - timer;
+ lc->last_update_timer = expires;
}
static inline int virt_timer_forward(u64 elapsed)
@@ -117,22 +118,23 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime,
static int do_account_vtime(struct task_struct *tsk)
{
u64 timer, clock, user, guest, system, hardirq, softirq;
+ struct lowcore *lc = get_lowcore();
- timer = S390_lowcore.last_update_timer;
- clock = S390_lowcore.last_update_clock;
+ timer = lc->last_update_timer;
+ clock = lc->last_update_clock;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
" stckf %1" /* Store current tod clock value */
- : "=Q" (S390_lowcore.last_update_timer),
- "=Q" (S390_lowcore.last_update_clock)
+ : "=Q" (lc->last_update_timer),
+ "=Q" (lc->last_update_clock)
: : "cc");
- clock = S390_lowcore.last_update_clock - clock;
- timer -= S390_lowcore.last_update_timer;
+ clock = lc->last_update_clock - clock;
+ timer -= lc->last_update_timer;
if (hardirq_count())
- S390_lowcore.hardirq_timer += timer;
+ lc->hardirq_timer += timer;
else
- S390_lowcore.system_timer += timer;
+ lc->system_timer += timer;
/* Update MT utilization calculation */
if (smp_cpu_mtid &&
@@ -141,16 +143,16 @@ static int do_account_vtime(struct task_struct *tsk)
/* Calculate cputime delta */
user = update_tsk_timer(&tsk->thread.user_timer,
- READ_ONCE(S390_lowcore.user_timer));
+ READ_ONCE(lc->user_timer));
guest = update_tsk_timer(&tsk->thread.guest_timer,
- READ_ONCE(S390_lowcore.guest_timer));
+ READ_ONCE(lc->guest_timer));
system = update_tsk_timer(&tsk->thread.system_timer,
- READ_ONCE(S390_lowcore.system_timer));
+ READ_ONCE(lc->system_timer));
hardirq = update_tsk_timer(&tsk->thread.hardirq_timer,
- READ_ONCE(S390_lowcore.hardirq_timer));
+ READ_ONCE(lc->hardirq_timer));
softirq = update_tsk_timer(&tsk->thread.softirq_timer,
- READ_ONCE(S390_lowcore.softirq_timer));
- S390_lowcore.steal_timer +=
+ READ_ONCE(lc->softirq_timer));
+ lc->steal_timer +=
clock - user - guest - system - hardirq - softirq;
/* Push account value */
@@ -176,17 +178,19 @@ static int do_account_vtime(struct task_struct *tsk)
void vtime_task_switch(struct task_struct *prev)
{
+ struct lowcore *lc = get_lowcore();
+
do_account_vtime(prev);
- prev->thread.user_timer = S390_lowcore.user_timer;
- prev->thread.guest_timer = S390_lowcore.guest_timer;
- prev->thread.system_timer = S390_lowcore.system_timer;
- prev->thread.hardirq_timer = S390_lowcore.hardirq_timer;
- prev->thread.softirq_timer = S390_lowcore.softirq_timer;
- S390_lowcore.user_timer = current->thread.user_timer;
- S390_lowcore.guest_timer = current->thread.guest_timer;
- S390_lowcore.system_timer = current->thread.system_timer;
- S390_lowcore.hardirq_timer = current->thread.hardirq_timer;
- S390_lowcore.softirq_timer = current->thread.softirq_timer;
+ prev->thread.user_timer = lc->user_timer;
+ prev->thread.guest_timer = lc->guest_timer;
+ prev->thread.system_timer = lc->system_timer;
+ prev->thread.hardirq_timer = lc->hardirq_timer;
+ prev->thread.softirq_timer = lc->softirq_timer;
+ lc->user_timer = current->thread.user_timer;
+ lc->guest_timer = current->thread.guest_timer;
+ lc->system_timer = current->thread.system_timer;
+ lc->hardirq_timer = current->thread.hardirq_timer;
+ lc->softirq_timer = current->thread.softirq_timer;
}
/*
@@ -196,28 +200,29 @@ void vtime_task_switch(struct task_struct *prev)
*/
void vtime_flush(struct task_struct *tsk)
{
+ struct lowcore *lc = get_lowcore();
u64 steal, avg_steal;
if (do_account_vtime(tsk))
virt_timer_expire();
- steal = S390_lowcore.steal_timer;
- avg_steal = S390_lowcore.avg_steal_timer;
+ steal = lc->steal_timer;
+ avg_steal = lc->avg_steal_timer;
if ((s64) steal > 0) {
- S390_lowcore.steal_timer = 0;
+ lc->steal_timer = 0;
account_steal_time(cputime_to_nsecs(steal));
avg_steal += steal;
}
- S390_lowcore.avg_steal_timer = avg_steal / 2;
+ lc->avg_steal_timer = avg_steal / 2;
}
static u64 vtime_delta(void)
{
- u64 timer = S390_lowcore.last_update_timer;
-
- S390_lowcore.last_update_timer = get_cpu_timer();
+ struct lowcore *lc = get_lowcore();
+ u64 timer = lc->last_update_timer;
- return timer - S390_lowcore.last_update_timer;
+ lc->last_update_timer = get_cpu_timer();
+ return timer - lc->last_update_timer;
}
/*
@@ -226,12 +231,13 @@ static u64 vtime_delta(void)
*/
void vtime_account_kernel(struct task_struct *tsk)
{
+ struct lowcore *lc = get_lowcore();
u64 delta = vtime_delta();
if (tsk->flags & PF_VCPU)
- S390_lowcore.guest_timer += delta;
+ lc->guest_timer += delta;
else
- S390_lowcore.system_timer += delta;
+ lc->system_timer += delta;
virt_timer_forward(delta);
}
@@ -241,7 +247,7 @@ void vtime_account_softirq(struct task_struct *tsk)
{
u64 delta = vtime_delta();
- S390_lowcore.softirq_timer += delta;
+ get_lowcore()->softirq_timer += delta;
virt_timer_forward(delta);
}
@@ -250,7 +256,7 @@ void vtime_account_hardirq(struct task_struct *tsk)
{
u64 delta = vtime_delta();
- S390_lowcore.hardirq_timer += delta;
+ get_lowcore()->hardirq_timer += delta;
virt_timer_forward(delta);
}