diff options
Diffstat (limited to 'arch/arm64/kvm')
42 files changed, 2489 insertions, 2168 deletions
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 1215df590418..231c0cd9c7b4 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); static const u8 default_ppi[] = { [TIMER_PTIMER] = 30, @@ -101,21 +102,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) } } -static u64 timer_get_offset(struct arch_timer_context *ctxt) -{ - u64 offset = 0; - - if (!ctxt) - return 0; - - if (ctxt->offset.vm_offset) - offset += *ctxt->offset.vm_offset; - if (ctxt->offset.vcpu_offset) - offset += *ctxt->offset.vcpu_offset; - - return offset; -} - static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { struct kvm_vcpu *vcpu = ctxt->vcpu; @@ -441,11 +427,30 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } +static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + if (is_hyp_ctxt(ctx->vcpu) && + (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { int ret; + kvm_timer_update_status(timer_ctx, new_level); + timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); @@ -466,10 +471,10 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { + if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } + + kvm_timer_update_status(ctx, should_fire); /* * If the timer can fire now, we don't need to have a soft timer @@ -513,7 +518,12 @@ static void timer_save_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); @@ -618,8 +628,15 @@ static void timer_restore_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: - set_cntvoff(timer_get_offset(ctx)); - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; @@ -742,27 +759,12 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); - - /* - * The virtual offset behaviour is "interesting", as it - * always applies when HCR_EL2.E2H==0, but only when - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we - * track E2H when putting the HV timer in "direct" mode. - */ - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { - struct arch_timer_offset *offs = &map->direct_vtimer->offset; - - if (vcpu_el2_e2h_is_set(vcpu)) - offs->vcpu_offset = NULL; - else - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); - } } } static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) { - bool tpt, tpc; + bool tvt, tpt, tvc, tpc, tvt02, tpt02; u64 clr, set; /* @@ -777,7 +779,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) * within this function, reality kicks in and we start adding * traps based on emulation requirements. */ - tpt = tpc = false; + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. + * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } /* * We have two possibility to deal with a physical offset: @@ -793,9 +817,20 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt = tpc = true; /* + * For the poor sods that could not correctly substract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + + /* * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. */ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); @@ -806,6 +841,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); } /* @@ -817,6 +855,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ sysreg_clear_set(cnthctl_el2, clr, set); @@ -905,6 +947,44 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_blocking(vcpu); } +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ + /* + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. + */ + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL1 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. + */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } +} + /* * With a userspace irqchip we have to check if the guest de-asserted the * timer and if so, unmask the timer irq signal on the host interrupt @@ -1363,6 +1443,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; @@ -1431,6 +1542,7 @@ int __init kvm_timer_hyp_init(bool has_gic) goto out_free_vtimer_irq; } + kvm_timer_handle_errata(); return 0; out_free_ptimer_irq: diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..071a7d75be68 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -61,7 +61,7 @@ static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTR DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); @@ -80,31 +80,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -/* - * This functions as an allow-list of protected VM capabilities. - * Features not explicitly allowed by this function are denied. - */ -static bool pkvm_ext_allowed(struct kvm *kvm, long ext) -{ - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_ARM_PSCI: - case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_NR_VCPUS: - case KVM_CAP_MAX_VCPUS: - case KVM_CAP_MAX_VCPU_ID: - case KVM_CAP_MSI_DEVID: - case KVM_CAP_ARM_VM_IPA_SIZE: - case KVM_CAP_ARM_PMU_V3: - case KVM_CAP_ARM_SVE: - case KVM_CAP_ARM_PTRAUTH_ADDRESS: - case KVM_CAP_ARM_PTRAUTH_GENERIC: - return true; - default: - return false; - } -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -113,7 +88,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->flags) return -EINVAL; - if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap)) + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) return -EINVAL; switch (cap->cap) { @@ -311,7 +286,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; - if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext)) + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) return 0; switch (ext) { @@ -476,8 +451,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_init(vcpu); - kvm_arm_reset_debug_ptr(vcpu); - kvm_arm_pvtime_vcpu_init(&vcpu->arch); vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; @@ -502,7 +475,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (!is_protected_kvm_enabled()) + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + else + free_hyp_memcache(&vcpu->arch.pkvm_memcache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu); @@ -574,6 +550,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + if (vcpu_has_nv(vcpu)) kvm_vcpu_load_hw_mmu(vcpu); @@ -594,10 +573,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) *last_ran = vcpu->vcpu_idx; } +nommu: vcpu->cpu = cpu; kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); + kvm_vcpu_load_debug(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); @@ -617,7 +598,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_set_pauth_traps(vcpu); - kvm_arch_vcpu_load_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); @@ -625,7 +612,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_put_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + + kvm_vcpu_put_debug(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_vhe(vcpu); @@ -808,8 +801,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) kvm_init_mpidr_data(kvm); - kvm_arm_vcpu_init_debug(vcpu); - if (likely(irqchip_in_kernel(kvm))) { /* * Map the VGIC hardware resources before running a vcpu the @@ -1187,7 +1178,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - kvm_arm_setup_debug(vcpu); kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** @@ -1204,8 +1194,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Back from guest *************************************************************/ - kvm_arm_clear_debug(vcpu); - /* * We must sync the PMU state before the vgic state so * that the vgic can properly sample the updated state of the @@ -1228,6 +1216,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); + if (is_hyp_ctxt(vcpu)) + kvm_timer_sync_nested(vcpu); + kvm_arch_vcpu_ctxsync_fp(vcpu); /* @@ -1571,7 +1562,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); /* * Handle the "start in power-off" case. @@ -1990,8 +1980,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - unsigned long tcr; + unsigned long tcr, ips; /* * Calculate the raw per-cpu offset without a translation from the @@ -2005,6 +1994,7 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); tcr = read_sysreg(tcr_el1); + ips = FIELD_GET(TCR_IPS_MASK, tcr); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { tcr |= TCR_EPD1_MASK; } else { @@ -2014,8 +2004,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); tcr &= ~TCR_EL2_PS_MASK; - tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0)); - if (kvm_lpa2_is_enabled()) + tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; @@ -2109,6 +2099,7 @@ static void cpu_set_hyp_vector(void) static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); + kvm_init_host_debug_data(); if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); @@ -2117,7 +2108,6 @@ static void cpu_hyp_init_context(void) static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - kvm_arm_init_debug(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); @@ -2300,6 +2290,19 @@ static int __init init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; @@ -2339,7 +2342,7 @@ static void __init teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); if (free_sve) { @@ -2410,6 +2413,13 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. + */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) @@ -2527,15 +2537,15 @@ static int __init init_hyp_mode(void) * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { - unsigned long stack_page; + unsigned long stack_base; - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { + stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); + if (!stack_base) { err = -ENOMEM; goto out_err; } - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; } /* @@ -2604,9 +2614,9 @@ static int __init init_hyp_mode(void) */ for_each_possible_cpu(cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); - err = create_hyp_stack(__pa(stack_page), ¶ms->stack_hyp_va); + err = create_hyp_stack(__pa(stack_base), ¶ms->stack_hyp_va); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; @@ -2618,7 +2628,7 @@ static int __init init_hyp_mode(void) * __hyp_pa() won't do the right thing there, since the stack * has been mapped in the flexible private VA space. */ - params->stack_pa = __pa(stack_page); + params->stack_pa = __pa(stack_base); } for_each_possible_cpu(cpu) { diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 8c5d7990e5b3..3a96c96816e9 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -111,7 +111,7 @@ static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; case TR_EL10: return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && - (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); + (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE); default: BUG(); } @@ -140,8 +140,8 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) } val = __vcpu_sys_reg(vcpu, TCR2_EL1); - wi->poe = val & TCR2_EL1x_POE; - wi->e0poe = val & TCR2_EL1x_E0POE; + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = val & TCR2_EL1_E0POE; } } @@ -739,8 +739,15 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, final_attr = s1_parattr; break; default: - /* MemAttr[2]=0, Device from S2 */ - final_attr = s2_memattr & GENMASK(1,0) << 2; + /* + * MemAttr[2]=0, Device from S2. + * + * FWB does not influence the way that stage 1 + * memory types and attributes are combined + * with stage 2 Device type and attributes. + */ + final_attr = min(s2_memattr_to_attr(s2_memattr), + s1_parattr); } } else { /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index ce8886122ed3..0e4c805e7e89 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -3,7 +3,8 @@ * Debug and Guest Debug support * * Copyright (C) 2015 - Linaro Ltd - * Author: Alex Bennée <alex.bennee@linaro.org> + * Authors: Alex Bennée <alex.bennee@linaro.org> + * Oliver Upton <oliver.upton@linux.dev> */ #include <linux/kvm_host.h> @@ -14,72 +15,6 @@ #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> -#include "trace.h" - -/* These are the bits of MDSCR_EL1 we may manipulate */ -#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ - DBG_MDSCR_KDE | \ - DBG_MDSCR_MDE) - -static DEFINE_PER_CPU(u64, mdcr_el2); - -/* - * save/restore_guest_debug_regs - * - * For some debug operations we need to tweak some guest registers. As - * a result we need to save the state of those registers before we - * make those modifications. - * - * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled - * after we have restored the preserved value to the main context. - * - * When single-step is enabled by userspace, we tweak PSTATE.SS on every - * guest entry. Preserve PSTATE.SS so we can restore the original value - * for the vcpu after the single-step is disabled. - */ -static void save_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; - - trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", - vcpu->arch.guest_debug_preserved.mdscr_el1); - - vcpu->arch.guest_debug_preserved.pstate_ss = - (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); -} - -static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); - - trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", - vcpu_read_sys_reg(vcpu, MDSCR_EL1)); - - if (vcpu->arch.guest_debug_preserved.pstate_ss) - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; - else - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; -} - -/** - * kvm_arm_init_debug - grab what we need for debug - * - * Currently the sole task of this function is to retrieve the initial - * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has - * presumably been set-up by some knowledgeable bootcode. - * - * It is called once per-cpu during CPU hyp initialisation. - */ - -void kvm_arm_init_debug(void) -{ - __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); -} - /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * @@ -95,11 +30,14 @@ void kvm_arm_init_debug(void) */ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { + preempt_disable(); + /* * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, + *host_data_ptr(nr_event_counters)); vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF | @@ -113,233 +51,215 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; /* - * Trap debug register access when one of the following is true: - * - Userspace is using the hardware to debug the guest - * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (DEBUG_DIRTY clear). - * - The guest has enabled the OS Lock (debug exceptions are blocked). + * Trap debug registers if the guest doesn't have ownership of them. */ - if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !vcpu_get_flag(vcpu, DEBUG_DIRTY) || - kvm_vcpu_os_lock_enabled(vcpu)) + if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); -} + /* Write MDCR_EL2 directly if we're already at EL2 */ + if (has_vhe()) + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); -/** - * kvm_arm_vcpu_init_debug - setup vcpu debug traps - * - * @vcpu: the vcpu pointer - * - * Set vcpu initial mdcr_el2 value. - */ -void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arm_setup_mdcr_el2(vcpu); preempt_enable(); } -/** - * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state - * @vcpu: the vcpu pointer - */ - -void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) +void kvm_init_host_debug_data(void) { - vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state; + u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + + if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, + read_sysreg(pmcr_el0)); + + *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + + if (has_vhe()) + return; + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) + host_data_set_flag(HAS_SPE); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { + /* Force disable trace in protected mode in case of no TRBE */ + if (is_protected_kvm_enabled()) + host_data_set_flag(EL1_TRACING_CONFIGURED); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); + } } -/** - * kvm_arm_setup_debug - set up debug related stuff +/* + * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host + * has taken over MDSCR_EL1. * - * @vcpu: the vcpu pointer + * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. * - * This is called before each entry into the hypervisor to setup any - * debug related registers. + * - Userspace is using the breakpoint/watchpoint registers to debug the + * guest, and MDSCR_EL1.MDE is forced to 1. * - * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the DEBUG_DIRTY - * flag on vcpu->arch.iflags). Since the guest must not interfere - * with the hardware state when debugging the guest, we must ensure that - * trapping is enabled whenever we are debugging the guest using the - * debug registers. + * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, + * masking all debug exceptions affected by the OS Lock. */ - -void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) +static void setup_external_mdscr(struct kvm_vcpu *vcpu) { - unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; + /* + * Use the guest's MDSCR_EL1 as a starting point, since there are + * several other features controlled by MDSCR_EL1 that are not relevant + * to the host. + * + * Clear the bits that KVM may use which also satisfies emulation of + * the OS Lock as MDSCR_EL1.MDE is cleared. + */ + u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | + MDSCR_EL1_MDE | + MDSCR_EL1_KDE); - trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + mdscr |= MDSCR_EL1_SS; - kvm_arm_setup_mdcr_el2(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; - /* Check if we need to use the debug registers. */ + vcpu->arch.external_mdscr_el1 = mdscr; +} + +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) +{ + u64 mdscr; + + /* Must be called before kvm_vcpu_load_vhe() */ + KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + + /* + * Determine which of the possible debug states we're in: + * + * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's + * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do + * software step or emulate the effects of the OS Lock being enabled. + * + * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and + * the breakpoint/watchpoint registers need to be loaded eagerly. + * + * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint + * context needs to be loaded on the CPU. + */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - /* Save guest debug state */ - save_guest_debug_regs(vcpu); + vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + setup_external_mdscr(vcpu); /* - * Single Step (ARM ARM D2.12.3 The software step state - * machine) - * - * If we are doing Single Step we need to manipulate - * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the - * step has occurred the hypervisor will trap the - * debug exception and we return to userspace. - * - * If the guest attempts to single step its userspace - * we would have to deal with a trapped exception - * while in the guest kernel. Because this would be - * hard to unwind we suppress the guest's ability to - * do so by masking MDSCR_EL.SS. - * - * This confuses guest debuggers which use - * single-step behind the scenes but everything - * returns to normal once the host is no longer - * debugging the system. + * Steal the guest's single-step state machine if userspace wants + * single-step the guest. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - /* - * If the software step state at the last guest exit - * was Active-pending, we don't set DBG_SPSR_SS so - * that the state is maintained (to not run another - * single-step until the pending Software Step - * exception is taken). - */ - if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) + if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) + vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + else + vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + + if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; - - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } else { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } + } else { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); - - /* - * HW Breakpoints and watchpoints - * - * We simply switch the debug_ptr to point to our new - * external_debug_state which has been populated by the - * debug ioctl. The existing DEBUG_DIRTY mechanism ensures - * the registers are updated on the world switch. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - /* Enable breakpoints/watchpoints */ - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - - vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); - - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - - /* - * The OS Lock blocks debug exceptions in all ELs when it is - * enabled. If the guest has enabled the OS Lock, constrain its - * effects to the guest. Emulate the behavior by clearing - * MDSCR_EL1.MDE. In so doing, we ensure that host debug - * exceptions are unaffected by guest configuration of the OS - * Lock. - */ - } else if (kvm_vcpu_os_lock_enabled(vcpu)) { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } + if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + else + vcpu->arch.debug_owner = VCPU_DEBUG_FREE; } - BUG_ON(!vcpu->guest_debug && - vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); - - /* If KDE or MDE are set, perform a full save/restore cycle. */ - if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - /* Write mdcr_el2 changes since vcpu_load on VHE systems */ - if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - - trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); + kvm_arm_setup_mdcr_el2(vcpu); } -void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { - trace_kvm_arm_clear_debug(vcpu->guest_debug); + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + return; /* - * Restore the guest's debug registers if we were using them. + * Save the host's software step state and restore the guest's before + * potentially returning to userspace. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) - /* - * Mark the vcpu as ACTIVE_PENDING - * until Software Step exception is taken. - */ - vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); - } - - restore_guest_debug_regs(vcpu); + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); + else + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); - /* - * If we were using HW debug we need to restore the - * debug_ptr to the guest debug state. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - kvm_arm_reset_debug_ptr(vcpu); + if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; +} - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); +/* + * Updates ownership of the debug registers after a trapped guest access to a + * breakpoint/watchpoint register. Host ownership of the debug registers is of + * strictly higher priority, and it is the responsibility of the VMM to emulate + * guest debug exceptions in this configuration. + */ +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) +{ + if (kvm_host_owns_debug_regs(vcpu)) + return; - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - } - } + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); } -void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) { - u64 dfr0; + if (val & OSLAR_EL1_OSLK) + __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; + else + __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; - /* For VHE, there is nothing to do */ - if (has_vhe()) + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} + +void kvm_enable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) return; - dfr0 = read_sysreg(id_aa64dfr0_el1); - /* - * If SPE is present on this CPU and is available at current EL, - * we may need to check if the host state needs to be saved. - */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && - !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(PMBIDR_EL1_P_SHIFT))) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); + host_data_set_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_enable_trbe); + +void kvm_disable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) + return; - /* Check if we have TRBE implemented and available at the host */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && - !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); + host_data_clear_flag(TRBE_ENABLED); } +EXPORT_SYMBOL_GPL(kvm_disable_trbe); -void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) { - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); + if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) + return; + + if (has_vhe()) { + write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); + return; + } + + *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; + if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) + host_data_set_flag(EL1_TRACING_CONFIGURED); + else + host_data_clear_flag(EL1_TRACING_CONFIGURED); } +EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 1ffbfd1c3cf2..607d37bab70b 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -89,6 +89,9 @@ enum cgt_group_id { CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + CGT_ICH_HCR_TC, CGT_ICH_HCR_TALL0, CGT_ICH_HCR_TALL1, @@ -124,6 +127,8 @@ enum cgt_group_id { __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, CGT_CPTR_TTA, CGT_MDCR_HPMN, @@ -393,6 +398,18 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_RW, }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TC, @@ -487,6 +504,32 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) return BEHAVE_FORWARD_RW; } +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); @@ -494,7 +537,7 @@ static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) if (!vcpu_el2_e2h_is_set(vcpu)) val = translate_cptr_el2_to_cpacr_el1(val); - if (val & CPACR_ELx_TTA) + if (val & CPACR_EL1_TTA) return BEHAVE_FORWARD_RW; return BEHAVE_HANDLE_LOCALLY; @@ -534,6 +577,8 @@ static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), CCC(CGT_CPTR_TTA, check_cptr_tta), CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; @@ -850,11 +895,15 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), - /* All _EL02, _EL12 registers */ + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), - sys_reg(3, 5, 14, 15, 7), CGT_HCR_NV), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), @@ -1184,6 +1233,11 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), /* * IMPDEF choice: @@ -2345,14 +2399,14 @@ inject: return true; } -static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit) +static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) { bool control_bit_set; if (!vcpu_has_nv(vcpu)) return false; - control_bit_set = __vcpu_sys_reg(vcpu, HCR_EL2) & control_bit; + control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit; if (!is_hyp_ctxt(vcpu) && control_bit_set) { kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; @@ -2360,9 +2414,24 @@ static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit) return false; } +static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, HCR_EL2, control_bit); +} + bool forward_smc_trap(struct kvm_vcpu *vcpu) { - return forward_traps(vcpu, HCR_TSC); + return forward_hcr_traps(vcpu, HCR_TSC); +} + +static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, MDCR_EL2, control_bit); +} + +bool forward_debug_exception(struct kvm_vcpu *vcpu) +{ + return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); } static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) @@ -2406,7 +2475,7 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) * Forward this trap to the virtual EL2 if the virtual * HCR_EL2.NV bit is set and this is coming from !EL2. */ - if (forward_traps(vcpu, HCR_NV)) + if (forward_hcr_traps(vcpu, HCR_NV)) return; spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ea5484ce1f3b..4d3d1a2eb157 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,14 +65,14 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); - vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); + host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SVE_ENABLED); + host_data_set_flag(HOST_SVE_ENABLED); if (system_supports_sme()) { - vcpu_clear_flag(vcpu, HOST_SME_ENABLED); + host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SME_ENABLED); + host_data_set_flag(HOST_SME_ENABLED); /* * If PSTATE.SM is enabled then save any pending FP @@ -168,8 +168,8 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) */ if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ - if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN); + if (host_data_test_flag(HOST_SME_ENABLED)) + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_SMEN_EL0EN, @@ -227,7 +227,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ - if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) + if (host_data_test_flag(HOST_SVE_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 12dad841f2a5..2196979a24a3 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -917,31 +917,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0; - trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { - ret = -EINVAL; - goto out; - } - - if (dbg->control & KVM_GUESTDBG_ENABLE) { - vcpu->guest_debug = dbg->control; - - /* Hardware assisted Break and Watch points */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - vcpu->arch.external_debug_state = dbg->arch; - } + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; - } else { - /* If not enabled clear all flags */ + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + return 0; } -out: - return ret; + vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + vcpu->arch.external_debug_state = dbg->arch; + + return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index d7c2990e7c9e..512d152233ff 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -183,6 +183,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u64 esr = kvm_vcpu_get_esr(vcpu); + if (!vcpu->guest_debug && forward_debug_exception(vcpu)) + return 1; + run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = lower_32_bits(esr); run->debug.arch.hsr_high = upper_32_bits(esr); @@ -193,7 +196,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->debug.arch.far = vcpu->arch.fault.far_el2; break; case ESR_ELx_EC_SOFTSTP_LOW: - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; break; } diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index d00093699aaf..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -88,15 +88,26 @@ default: write_debug(ptr[0], reg, 0); \ } +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + return NULL; +} + static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); save_debug(dbg->dbg_bcr, dbgbcr, brps); save_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg, static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); restore_debug(dbg->dbg_bcr, dbgbcr, brps); restore_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(host_dbg, host_ctxt); __debug_restore_state(guest_dbg, guest_ctxt); @@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 34f53707892d..f838a45665f2 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -419,9 +419,9 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* First disable enough traps to allow us to update the registers */ if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); else - cpacr_clear_set(0, CPACR_ELx_FPEN); + cpacr_clear_set(0, CPACR_EL1_FPEN); isb(); /* Write out the host state if it's in the registers */ @@ -501,7 +501,12 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) +{ + return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); +} + +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; u32 sysreg; @@ -511,18 +516,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) * We only get here for 64bit guests, 32bit guests will hit * the long and winding road all the way to the standard * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). */ + if (is_hyp_ctxt(vcpu)) + return false; + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); switch (sysreg) { case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (vcpu_has_nv(vcpu)) { - if (is_hyp_ctxt(vcpu)) { - ctxt = vcpu_hptimer(vcpu); - break; - } - /* Check for guest hypervisor trapping */ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) @@ -534,16 +540,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) ctxt = vcpu_ptimer(vcpu); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; default: return false; } - val = arch_timer_read_cntpct_el0(); - - if (ctxt->offset.vm_offset) - val -= *kern_hyp_va(ctxt->offset.vm_offset); - if (ctxt->offset.vcpu_offset) - val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + val = compute_counter_value(ctxt); vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); @@ -588,7 +601,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (kvm_hyp_handle_cntpct(vcpu)) + if (kvm_handle_cntxct(vcpu)) return true; return false; diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index a651c43ad679..76ff095c6b6e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -18,9 +18,34 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; + + if (!vcpu) + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + return vcpu; +} + +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) @@ -33,16 +58,6 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); } -static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) -{ - struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; - - if (!vcpu) - vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); - - return vcpu; -} - static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); @@ -139,7 +154,7 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) @@ -283,7 +298,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -300,7 +315,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index f957890c7e38..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,223 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba <tabba@google.com> - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include <asm/sysreg.h> - -/* - * This file contains definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - * - Spectre/Meltdown Mitigation - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, RAS, IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -#define PVM_ID_AA64PFR2_ALLOW 0ULL - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - * - Control for cache maintenance permission - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -#define PVM_ID_AA64MMFR3_ALLOW (0ULL) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -/* Restrict pointer authentication to the basic version. */ -#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ - ) - -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -int kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 97c527ef53c2..3766333bace9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include <nvhe/memory.h> #include <nvhe/spinlock.h> -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER ((u8)(~0)) struct hyp_pool { /* @@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0972faccc2af..978f38c386ee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,40 +11,10 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/virt.h> +#include <nvhe/memory.h> #include <nvhe/pkvm.h> #include <nvhe/spinlock.h> -/* - * SW bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). - */ -enum pkvm_page_state { - PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, - PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, - __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | - KVM_PGTABLE_PROT_SW1, - - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE, -}; - -#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) -static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, - enum pkvm_page_state state) -{ - return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; -} - -static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) -{ - return prot & PKVM_PAGE_STATE_PROT_MASK; -} - struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -69,6 +39,13 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..34233d586060 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,9 +7,47 @@ #include <linux/types.h> +/* + * Bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), + + /* Meta-states which aren't encoded directly in the PTE's SW bits */ + PKVM_NOPAGE = BIT(2), +}; +#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + struct hyp_page { - unsigned short refcount; - unsigned short order; + u16 refcount; + u8 order; + + /* Host (non-meta) state. Guarded by the host stage-2 lock. */ + enum pkvm_page_state host_state : 8; + + u32 host_share_guest_count; }; extern u64 __hyp_vmemmap; @@ -29,7 +67,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) -#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 24a9a8330d19..e42bf68c8848 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* @@ -47,6 +53,8 @@ struct pkvm_hyp_vm { struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { @@ -58,6 +66,11 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) return vcpu_is_protected(&hyp_vcpu->vcpu); } +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, @@ -69,5 +82,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 53efda0235cf..2f4a4f5036bb 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -51,42 +51,55 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_el1(pmscr_el1, SYS_PMSCR); } -static void __debug_save_trace(u64 *trfcr_el1) +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) { - *trfcr_el1 = 0; + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); +} - /* Check if the TRBE is enabled */ - if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E)) - return; - /* - * Prohibit trace generation while we are in guest. - * Since access to TRFCR_EL1 is trapped, the guest can't - * modify the filtering set by the host. - */ - *trfcr_el1 = read_sysreg_el1(SYS_TRFCR); - write_sysreg_el1(0, SYS_TRFCR); - isb(); - /* Drain the trace buffer to memory */ - tsb_csync(); +static bool __trace_needs_drain(void) +{ + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + + return host_data_test_flag(TRBE_ENABLED); } -static void __debug_restore_trace(u64 trfcr_el1) +static bool __trace_needs_switch(void) { - if (!trfcr_el1) - return; + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} - /* Restore trace filter controls */ - write_sysreg_el1(trfcr_el1, SYS_TRFCR); +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} + +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); - /* Disable and flush Self-Hosted Trace generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_save_trace(host_data_ptr(host_debug_state.trfcr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -96,18 +109,13 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_restore_trace(*host_data_ptr(host_debug_state.trfcr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 3d610fc51f4d..58f0cb2298cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -188,12 +188,12 @@ SYM_FUNC_END(__host_hvc) /* * Test whether the SP has overflowed, without corrupting a GPR. - * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit * of SP should always be 1. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp - tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6aa0b13d86e5..6e12c070832f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -68,7 +68,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); isb(); if (vcpu_has_sve(vcpu)) @@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; @@ -103,8 +126,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) /* Limit guest vector length to the maximum supported by the host. */ hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & @@ -112,8 +133,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; @@ -127,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) unsigned int i; fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; @@ -141,16 +161,46 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); /* * KVM (and pKVM) doesn't support SME guests for now, and @@ -163,9 +213,6 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) goto out; } - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -176,12 +223,141 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(host_vcpu); + ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_unshare_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(bool, mkold, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } @@ -233,6 +409,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -264,11 +456,6 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); -} - static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); @@ -384,7 +571,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { /* ___kvm_hyp_init */ - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_cpu_set_vector), @@ -395,6 +581,12 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), @@ -409,6 +601,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -481,7 +676,7 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_smc(host_ctxt); break; case ESR_ELx_EC_SVE: - cpacr_clear_set(0, CPACR_ELx_ZEN); + cpacr_clear_set(0, CPACR_EL1_ZEN); isb(); sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index caba3e4bd09e..7ad7b133b81a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); - memset(p, 0, sizeof(*p)); p->refcount = 1; + p->order = 0; return addr; } @@ -268,6 +268,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ @@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); while (addr) { - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); @@ -382,19 +385,28 @@ bool addr_is_memory(phys_addr_t phys) return !!find_mem_range(phys, &range); } -static bool addr_is_allowed_memory(phys_addr_t phys) +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) { struct memblock_region *reg; struct kvm_mem_range range; - reg = find_mem_range(phys, &range); + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; - return reg && !(reg->flags & MEMBLOCK_NOMAP); -} + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) -{ - return range->start <= addr && addr < range->end; + return 0; } static bool range_is_memory(u64 start, u64 end) @@ -454,8 +466,10 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) if (kvm_pte_valid(pte)) return -EAGAIN; - if (pte) + if (pte) { + WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE); return -EPERM; + } do { u64 granule = kvm_granule_size(level); @@ -477,10 +491,33 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + phys_addr_t end = addr + size; + + for (; addr < end; addr += PAGE_SIZE) + hyp_phys_to_page(addr)->host_state = state; +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + int ret; + + if (!addr_is_memory(addr)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) @@ -546,39 +583,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(ret && ret != -EAGAIN); } -struct pkvm_mem_transition { - u64 nr_pages; - - struct { - enum pkvm_component_id id; - /* Address in the initiator's address space */ - u64 addr; - - union { - struct { - /* Address in the completer's address space */ - u64 completer_addr; - } host; - struct { - u64 completer_addr; - } hyp; - }; - } initiator; - - struct { - enum pkvm_component_id id; - } completer; -}; - -struct pkvm_mem_share { - const struct pkvm_mem_transition tx; - const enum kvm_pgtable_prot completer_prot; -}; - -struct pkvm_mem_donation { - const struct pkvm_mem_transition tx; -}; - struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); @@ -604,115 +608,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) -{ - if (!addr_is_allowed_memory(addr)) - return PKVM_NOPAGE; - - if (!kvm_pte_valid(pte) && pte) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); -} - static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = host_get_page_state, - }; + u64 end = addr + size; + int ret; + + ret = check_range_allowed_memory(addr, end); + if (ret) + return ret; hyp_assert_lock_held(&host_mmu.lock); - return check_page_state_range(&host_mmu.pgt, addr, size, &d); + for (; addr < end; addr += PAGE_SIZE) { + if (hyp_phys_to_page(addr)->host_state != state) + return -EPERM; + } + + return 0; } static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); - - return host_stage2_idmap_locked(addr, size, prot); -} - -static int host_request_owned_transition(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_request_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_share(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u8 owner_id = tx->completer.id; - u64 size = tx->nr_pages * PAGE_SIZE; - - *completer_addr = tx->initiator.host.completer_addr; - return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); -} - -static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HYP); -} - -static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, - enum pkvm_page_state state) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (__host_ack_skip_pgtable_check(tx)) - return 0; - - return __host_check_page_state_range(addr, size, state); -} + if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); -static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - return __host_ack_transition(addr, tx, PKVM_NOPAGE); -} + if (ret) + return ret; + } -static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; + __host_update_page_state(addr, size, state); - return host_stage2_set_owner_locked(addr, size, host_id); + return 0; } static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) @@ -735,576 +662,418 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } -static int hyp_request_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; - *completer_addr = tx->initiator.hyp.completer_addr; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int hyp_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, + u64 size, enum pkvm_page_state state) { - u64 size = tx->nr_pages * PAGE_SIZE; - int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; - *completer_addr = tx->initiator.hyp.completer_addr; - ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); - return (ret != size) ? -EFAULT : 0; + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); } -static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +int __pkvm_host_share_hyp(u64 pfn) { - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HOST); -} + u64 phys = hyp_pfn_to_phys(pfn); + void *virt = __hyp_va(phys); + enum kvm_pgtable_prot prot; + u64 size = PAGE_SIZE; + int ret; -static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + host_lock_component(); + hyp_lock_component(); - if (perms != PAGE_HYP) - return -EPERM; + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; } -static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +int __pkvm_host_unshare_hyp(u64 pfn) { - u64 size = tx->nr_pages * PAGE_SIZE; - - if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) - return -EBUSY; + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; + int ret; - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; + host_lock_component(); + hyp_lock_component(); - return __hyp_check_page_state_range(addr, size, - PKVM_PAGE_SHARED_BORROWED); -} + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } -static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) { - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); enum kvm_pgtable_prot prot; + int ret; - prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); - return pkvm_create_mappings_locked(start, end, prot); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } - return (ret != size) ? -EFAULT : 0; -} + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); -static int hyp_complete_donation(u64 addr, - const struct pkvm_mem_transition *tx) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); +unlock: + hyp_unlock_component(); + host_unlock_component(); - return pkvm_create_mappings_locked(start, end, prot); + return ret; } -static int check_share(struct pkvm_mem_share *share) +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED); if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We only check the host; the secure side will check the other - * end when we forward the FFA call. - */ - ret = 0; - break; - default: - ret = -EINVAL; + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; } + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + return ret; } -static int __do_share(struct pkvm_mem_share *share) +int hyp_pin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_share(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We're not responsible for any secure page-tables, so there's - * nothing to do here. - */ - ret = 0; - break; - default: - ret = -EINVAL; - } + goto unlock; - return ret; -} + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; -/* - * do_share(): - * - * The page owner grants access to another component with a given set - * of permissions. - * - * Initiator: OWNED => SHARED_OWNED - * Completer: NOPAGE => SHARED_BORROWED - */ -static int do_share(struct pkvm_mem_share *share) -{ - int ret; + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); - ret = check_share(share); - if (ret) - return ret; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return WARN_ON(__do_share(share)); + return ret; } -static int check_unshare(struct pkvm_mem_share *share) +void hyp_unpin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); - if (ret) - return ret; + host_lock_component(); + hyp_lock_component(); - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See check_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); - return ret; + hyp_unlock_component(); + host_unlock_component(); } -static int __do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See __do_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + host_unlock_component(); return ret; } -/* - * do_unshare(): - * - * The page owner revokes access from another component for a range of - * pages which were previously shared using do_share(). - * - * Initiator: SHARED_OWNED => OWNED - * Completer: SHARED_BORROWED => NOPAGE - */ -static int do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - ret = check_unshare(share); - if (ret) - return ret; + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + host_unlock_component(); - return WARN_ON(__do_unshare(share)); + return ret; } -static int check_donation(struct pkvm_mem_donation *donation) +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) { - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_request_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); if (ret) return ret; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_ack_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_ack_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - return ret; -} + host_lock_component(); + guest_lock_component(vm); -static int __do_donate(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; + ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE); + if (ret) + goto unlock; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_donation(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_initiate_donation(&completer_addr, tx); + page = hyp_phys_to_page(phys); + switch (page->host_state) { + case PKVM_PAGE_OWNED: + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); break; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count) + break; + /* Only host to np-guest multi-sharing is tolerated */ + WARN_ON(1); + fallthrough; default: - ret = -EINVAL; + ret = -EPERM; + goto unlock; } - if (ret) - return ret; + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + page->host_share_guest_count++; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_complete_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_complete_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } +unlock: + guest_unlock_component(vm); + host_unlock_component(); return ret; } -/* - * do_donate(): - * - * The page owner transfers ownership to another component, losing access - * as a consequence. - * - * Initiator: OWNED => NOPAGE - * Completer: NOPAGE => OWNED - */ -static int do_donate(struct pkvm_mem_donation *donation) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa) { + enum pkvm_page_state state; + struct hyp_page *page; + kvm_pte_t pte; + u64 phys; + s8 level; int ret; - ret = check_donation(donation); + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; + if (!kvm_pte_valid(pte)) + return -ENOENT; - return WARN_ON(__do_donate(donation)); -} - -int __pkvm_host_share_hyp(u64 pfn) -{ - int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; - host_lock_component(); - hyp_lock_component(); + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (WARN_ON(ret)) + return ret; - ret = do_share(&share); + page = hyp_phys_to_page(phys); + if (page->host_state != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; - hyp_unlock_component(); - host_unlock_component(); + *__phys = phys; - return ret; + return 0; } -int __pkvm_host_unshare_hyp(u64 pfn) +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) { + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (ret) + goto unlock; + + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + if (ret) + goto unlock; - ret = do_unshare(&share); + page = hyp_phys_to_page(phys); + page->host_share_guest_count--; + if (!page->host_share_guest_count) + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED)); - hyp_unlock_component(); +unlock: + guest_unlock_component(vm); host_unlock_component(); return ret; } -int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - }; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); - ret = do_donate(&donation); + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); return ret; } -int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) { + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HYP, - .addr = hyp_addr, - .hyp = { - .completer_addr = host_addr, - }, - }, - .completer = { - .id = PKVM_ID_HOST, - }, - }, - }; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); - ret = do_donate(&donation); + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); return ret; } -int hyp_pin_shared_mem(void *from, void *to) +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); - u64 size = end - start; + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; int ret; host_lock_component(); - hyp_lock_component(); - - ret = __host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); - if (ret) - goto unlock; - - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); - if (ret) - goto unlock; + guest_lock_component(vm); - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); -unlock: - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); return ret; } -void hyp_unpin_shared_mem(void *from, void *to) -{ - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); - - host_lock_component(); - hyp_lock_component(); - - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); - - hyp_unlock_component(); - host_unlock_component(); -} - -int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; host_lock_component(); - ret = do_share(&share); - host_unlock_component(); + guest_lock_component(vm); - return ret; -} + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); -int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) -{ - int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; - - host_lock_component(); - ret = do_unshare(&share); + guest_unlock_component(vm); host_unlock_component(); return ret; diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 8850b591d775..f41c7440b34b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -360,10 +360,10 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) prev_base = __io_map_base; /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; addr = ALIGN(__io_map_base, size); ret = __pkvm_alloc_private_va_range(addr, size); @@ -373,12 +373,12 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE, - PAGE_SIZE, phys, PAGE_HYP); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); if (ret) __io_map_base = prev_base; } diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index e691290d3765..a1eb27a1a747 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; + u8 order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -129,7 +129,7 @@ insert: static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -195,10 +195,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 01616c39a810..3927fe52a3dd 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -9,7 +9,6 @@ #include <asm/kvm_emulate.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> #include <nvhe/memory.h> #include <nvhe/pkvm.h> @@ -24,232 +23,160 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; /* - * Set trap register values based on features in ID_AA64PFR0. + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; - u64 cptr_clear = 0; - - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL1_IMP); +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); - /* - * Linux guests assume support for floating-point and Advanced SIMD. Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; if (has_hvhe()) - hcr_set |= HCR_E2H; + vcpu->arch.hcr_el2 |= HCR_E2H; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { - if (has_hvhe()) - cptr_clear |= CPACR_ELx_ZEN; - else - cptr_set |= CPTR_EL2_TZ; - } + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; - vcpu->arch.cptr_el2 &= ~cptr_clear; + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; } -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.hcr_el2; - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; + /* No support for AArch32. */ + val |= HCR_RW; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; + + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); } - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; -} + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + val &= ~(HCR_AMVOFFEN); -/* - * Set trap register values based on features in ID_AA64DFR0. - */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; - - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + val |= HCR_TLOR; - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; + vcpu->arch.hcr_el2 = val; +} + +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.mdcr_el2; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { - if (has_hvhe()) - cptr_set |= CPACR_EL1_TTA; - else - cptr_set |= CPTR_EL2_TTA; - } + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + val |= MDCR_EL2_TDOSA; - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; + } - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; -} + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + val |= MDCR_EL2_TTRF; -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + val |= MDCR_EL2_TDCC; - vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 = val; } /* - * Set trap register values based on features in ID_AA64MMFR1. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; - - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; + struct kvm *kvm = vcpu->kvm; - vcpu->arch.hcr_el2 |= hcr_set; -} - -/* - * Set baseline trap register values. - */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) -{ - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + /* Protected KVM does not support AArch32 guests. */ + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; - - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - if (!has_hvhe()) { - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); - } -} - -static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - - if (has_hvhe()) - vcpu->arch.hcr_el2 |= HCR_E2H; - - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { - /* route synchronous external abort exceptions to EL2 */ - vcpu->arch.hcr_el2 |= HCR_TEA; - /* trap error record accesses */ - vcpu->arch.hcr_el2 |= HCR_TERR; - } - - if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) - vcpu->arch.hcr_el2 |= HCR_FWB; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; - if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) - vcpu->arch.hcr_el2 |= HCR_TID4; - else - vcpu->arch.hcr_el2 |= HCR_TID2; + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; - if (vcpu_has_ptrauth(vcpu)) - vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + return 0; } /* * Initialize trap register values in protected mode. */ -static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + vcpu->arch.mdcr_el2 = 0; pkvm_vcpu_reset_hcr(vcpu); - if ((!vcpu_is_protected(vcpu))) - return; + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) + return 0; + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_mdcr(vcpu); - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + return 0; } /* @@ -270,10 +197,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* * The table of VM entries for protected VMs in hyp. @@ -306,15 +233,30 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -323,17 +265,63 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) { struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); @@ -342,50 +330,26 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); - /* - * For protected VMs, always allow: - * - CPU starting in poweroff state - * - PSCI v0.2 - */ - set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - /* - * Check if remaining features are allowed: - * - Performance Monitoring - * - Scalable Vectors - * - Pointer Authentication - */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) - set_bit(KVM_ARM_VCPU_SVE, allowed_features); - - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, allowed_features, KVM_VCPU_MAX_FEATURES); } -static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) -{ - struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { - kvm_vcpu_enable_ptrauth(vcpu); - } else { - vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); - } -} - static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -408,6 +372,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); } @@ -415,10 +380,8 @@ static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu * { struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { - vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); - } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -446,9 +409,14 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); - pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -693,8 +661,6 @@ unlock: return ret; } - hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); - return 0; } @@ -746,6 +712,14 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Push the metadata pages to the teardown memcache */ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index cbdd18cd3f98..d62bcb5634a2 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -12,7 +12,6 @@ #include <nvhe/early_alloc.h> #include <nvhe/ffa.h> -#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -180,7 +179,6 @@ static void hpool_put_page(void *addr) static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - enum kvm_pgtable_prot prot; enum pkvm_page_state state; phys_addr_t phys; @@ -203,16 +201,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, case PKVM_PAGE_OWNED: return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED; break; case PKVM_PAGE_SHARED_BORROWED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED; break; default: return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return 0; } static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index ed6b58b19cfa..5b6eeab1a774 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); stacktrace_info->fp = fp; stacktrace_info->pc = pc; @@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; + unsigned long low = high - NVHE_STACK_SIZE; return (struct stack_info) { .low = low, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index cc69106734ca..6c846d033d24 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,7 +26,6 @@ #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -36,33 +35,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_traps(struct kvm_vcpu *vcpu) +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - ___activate_traps(vcpu, vcpu->arch.hcr_el2); - __activate_traps_common(vcpu); + if (has_hvhe()) { + val |= CPACR_EL1_TTA; - val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; - if (cpus_have_final_cap(ARM64_SME)) { - if (has_hvhe()) - val &= ~CPACR_ELx_SMEN; - else - val |= CPTR_EL2_TSM; - } + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + } else { + val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - if (!guest_owns_fp_regs()) { - if (has_hvhe()) - val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); - else - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; - __activate_traps_fpsimd32(vcpu); + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; } + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + kvm_write_cptr_el2(val); +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + ___activate_traps(vcpu, vcpu->arch.hcr_el2); + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -192,7 +204,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) /* Re-enable SVE traps if not supported for the guest vcpu. */ if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_ELx_ZEN, 0); + cpacr_clear_set(CPACR_EL1_ZEN, 0); } else { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2860548d4250..1ddd9ed3cbb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,7 @@ #include <hyp/adjust_pc.h> -#include <nvhe/fixed_config.h> +#include <nvhe/pkvm.h> #include "../../sys_regs.h" @@ -28,222 +28,255 @@ u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; u64 id_aa64smfr0_el1_sys_val; -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) -{ - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - __kvm_adjust_pc(vcpu); - - write_sysreg_el1(esr, SYS_ESR); - write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); - write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); -} - -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. - */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ } - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} +#define FEAT_END { .width = 0, } -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +static bool vm_has_ptrauth(const struct kvm *kvm) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; - - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - return id_aa64pfr1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); } -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +static bool vm_has_sve(const struct kvm *kvm) { - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); } -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; - return id_aa64isar1_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; - return id_aa64isar2_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the revelant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) { - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ? min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; + return val; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); + return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; default: /* Unhandled ID register, RAZ */ return 0; } } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -271,13 +304,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_EL1_IMP); - return pvm_access_raz_wi(vcpu, p, r); } @@ -449,6 +475,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { }; /* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + +/* * Checks that the sysreg table is unique and in-order. * * Returns 0 if the table is consistent, or 1 otherwise. diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 3aaab20ae5b4..ff176f4ce7de 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val, shift = 0; + u64 set, clr, shift = 0; if (has_hvhe()) shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* @@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) set <<= 10; } + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 40bd55966540..df5cc74a7dd0 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -35,14 +35,6 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } -static bool kvm_phys_is_valid(u64 phys) -{ - u64 parange_max = kvm_get_parange_max(); - u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); - - return phys < BIT(shift); -} - static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { u64 granule = kvm_granule_size(ctx->level); @@ -53,7 +45,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, if (granule > (ctx->end - ctx->addr)) return false; - if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) + if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); @@ -587,6 +579,9 @@ struct stage2_map_data { /* Force mappings to page granularity */ bool force_pte; + + /* Walk should update owner_id only */ + bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -885,18 +880,7 @@ static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = data->phys; - /* - * Stage-2 walks to update ownership data are communicated to the map - * walker using an invalid PA. Avoid offsetting an already invalid PA, - * which could overflow and make the address valid again. - */ - if (!kvm_phys_is_valid(phys)) - return phys; - - /* - * Otherwise, work out the correct PA based on how far the walk has - * gotten. - */ + /* Work out the correct PA based on how far the walk has gotten */ return phys + (ctx->addr - ctx->start); } @@ -908,6 +892,9 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; + if (data->annotation) + return true; + return kvm_block_mapping_supported(ctx, phys); } @@ -923,7 +910,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; - if (kvm_phys_is_valid(phys)) + if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); @@ -1085,11 +1072,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, { int ret; struct stage2_map_data map_data = { - .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, + .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1245,14 +1232,13 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) { int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - NULL, NULL, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + NULL, NULL, flags); if (!ret) dsb(ishst); } @@ -1308,7 +1294,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; @@ -1326,9 +1312,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index 289689b2682d..0100339b09e0 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 80581b1c3995..b5b9dbaf1fdd 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -77,12 +77,12 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM * shift value for trapping the AMU accesses. */ - u64 val = CPACR_ELx_TTA | CPTR_EL2_TAM; + u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; if (guest_owns_fp_regs()) { - val |= CPACR_ELx_FPEN; + val |= CPACR_EL1_FPEN; if (vcpu_has_sve(vcpu)) - val |= CPACR_ELx_ZEN; + val |= CPACR_EL1_ZEN; } else { __activate_traps_fpsimd32(vcpu); } @@ -122,13 +122,13 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) * hypervisor has traps enabled to dispel any illusion of something more * complicated taking place. */ - if (!(SYS_FIELD_GET(CPACR_ELx, FPEN, cptr) & BIT(0))) - val &= ~CPACR_ELx_FPEN; - if (!(SYS_FIELD_GET(CPACR_ELx, ZEN, cptr) & BIT(0))) - val &= ~CPACR_ELx_ZEN; + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_ELx_E0POE; + val |= cptr & CPACR_EL1_E0POE; val |= cptr & CPTR_EL2_TCPAC; @@ -256,6 +256,110 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; } +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); @@ -409,6 +513,9 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) return true; + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) return true; diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 5f78a39053a7..90b018e06f2c 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -216,7 +216,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) { + if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { @@ -260,7 +260,7 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) + if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9d46ad57e52..1f55b0c7b11d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -15,6 +15,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -29,8 +30,12 @@ static unsigned long __ro_after_init hyp_idmap_start; static unsigned long __ro_after_init hyp_idmap_end; static phys_addr_t __ro_after_init hyp_idmap_vector; +u32 __ro_after_init __hyp_va_bits; + static unsigned long __ro_after_init io_map_base; +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size) { @@ -147,7 +152,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, return -EINVAL; next = __stage2_range_addr_end(addr, end, chunk_size); - ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); if (ret) break; } while (addr = next, addr != end); @@ -168,15 +173,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); return 0; } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - kvm_tlb_flush_vmid_range(&kvm->arch.mmu, - gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); return 0; } @@ -225,7 +238,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) void *pgtable = page_to_virt(page); s8 level = page_private(page); - kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } static void stage2_free_unlinked_table(void *addr, s8 level) @@ -324,7 +337,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } @@ -336,7 +349,7 @@ void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, @@ -704,10 +717,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) mutex_lock(&kvm_hyp_pgd_mutex); /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; base = ALIGN_DOWN(io_map_base - size, size); ret = __hyp_alloc_private_va_range(base); @@ -724,12 +737,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr, - PAGE_HYP); + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); if (ret) kvm_err("Cannot map hyp stack\n"); @@ -942,10 +955,14 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return -ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; @@ -959,7 +976,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; mmu->split_page_cache.gfp_zero = __GFP_ZERO; - mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); if (kvm_is_nested_s2_mmu(kvm, mmu)) @@ -968,7 +984,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1065,7 +1081,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } @@ -1082,9 +1098,11 @@ static void *hyp_mc_alloc_fn(void *unused) void free_hyp_memcache(struct kvm_hyp_memcache *mc) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) @@ -1092,6 +1110,12 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) if (!is_protected_kvm_enabled()) return 0; + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, kvm_host_pa, NULL); } @@ -1130,8 +1154,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, break; write_lock(&kvm->mmu_lock); - ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache, 0); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1151,7 +1175,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, */ void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** @@ -1442,9 +1466,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); @@ -1452,6 +1476,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1471,8 +1496,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * and a write fault needs to collapse a block entry into a table. */ if (!fault_is_perm || (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) { + memcache = &vcpu->arch.mmu_page_cache; + ret = kvm_mmu_topup_memory_cache(memcache, min_pages); + } else { + memcache = &vcpu->arch.pkvm_memcache; + ret = topup_hyp_memcache(memcache, min_pages); + } if (ret) return ret; } @@ -1493,7 +1525,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. */ - if (logging_active) { + if (logging_active || is_protected_kvm_enabled()) { force_pte = true; vma_shift = PAGE_SHIFT; } else { @@ -1633,7 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= kvm_encode_nested_level(nested); } - read_lock(&kvm->mmu_lock); + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; @@ -1695,18 +1727,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * PTE, which will be preserved. */ prot &= ~KVM_NV_GUEST_MAP_SZ; - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); } else { - ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + memcache, flags); } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, writable); - read_unlock(&kvm->mmu_lock); + kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) @@ -1718,13 +1748,14 @@ out_unlock: /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); } @@ -1764,7 +1795,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) @@ -1930,7 +1961,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); /* @@ -1946,7 +1977,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); } @@ -2056,6 +2087,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9b36218b48de..0c9387d2f507 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) if (!tmp) return -ENOMEM; + swap(kvm->arch.nested_mmus, tmp); + /* * If we went through a realocation, adjust the MMU back-pointers in * the previously initialised kvm_pgtable structures. */ if (kvm->arch.nested_mmus != tmp) for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - tmp[i].pgt->mmu = &tmp[i]; + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) - ret = init_nested_s2_mmu(kvm, &tmp[i]); + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); if (ret) { for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) - kvm_free_stage2_pgd(&tmp[i]); + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); return ret; } kvm->arch.nested_mmus_size = num_mmus; - kvm->arch.nested_mmus = tmp; return 0; } @@ -830,8 +831,10 @@ static void limit_nv_id_regs(struct kvm *kvm) NV_FTR(PFR0, RAS) | NV_FTR(PFR0, EL3) | NV_FTR(PFR0, EL2) | - NV_FTR(PFR0, EL1)); - /* 64bit EL1/EL2/EL3 only */ + NV_FTR(PFR0, EL1) | + NV_FTR(PFR0, EL0)); + /* 64bit only at any EL */ + val |= FIELD_PREP(NV_FTR(PFR0, EL0), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); @@ -963,14 +966,15 @@ static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, kvm->arch.sysreg_masks->mask[i].res1 = res1; } -int kvm_init_nv_sysregs(struct kvm *kvm) +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u64 res0, res1; lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) - return 0; + goto out; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), GFP_KERNEL_ACCOUNT); @@ -1021,8 +1025,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCR_NV2; if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) res0 |= (HCR_AT | HCR_NV1 | HCR_NV); - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HCR_API | HCR_APK); if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP)) res0 |= BIT(39); @@ -1078,8 +1082,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) /* HFG[RW]TR_EL2 */ res0 = res1 = 0; - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey | HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey | HFGxTR_EL2_APIBKey); @@ -1271,6 +1275,25 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= MDCR_EL2_EnSTEPOP; set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + +out: + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + (void)__vcpu_sys_reg(vcpu, sr); + return 0; } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 85117ea8f351..930b677eb9b0 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/kmemleak.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> #include <linux/sort.h> @@ -268,3 +269,203 @@ static int __init finalize_pkvm(void) return ret; } device_initcall_sync(finalize_pkvm); + +static int cmp_mappings(struct rb_node *node, const struct rb_node *parent) +{ + struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node); + struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node); + + if (a->gfn < b->gfn) + return -1; + if (a->gfn > b->gfn) + return 1; + return 0; +} + +static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn) +{ + struct rb_node *node = root->rb_node, *prev = NULL; + struct pkvm_mapping *mapping; + + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + if (mapping->gfn == gfn) + return node; + prev = node; + node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right; + } + + return prev; +} + +/* + * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing + * of __map inline. + */ +#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ + for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \ + ((__start) >> PAGE_SHIFT)); \ + __tmp && ({ \ + __map = rb_entry(__tmp, struct pkvm_mapping, node); \ + __tmp = rb_next(__tmp); \ + true; \ + }); \ + ) \ + if (__map->gfn < ((__start) >> PAGE_SHIFT)) \ + continue; \ + else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \ + break; \ + else + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + pgt->pkvm_mappings = RB_ROOT; + pgt->mmu = mmu; + + return 0; +} + +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + struct rb_node *node; + + if (!handle) + return; + + node = rb_first(&pgt->pkvm_mappings); + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + node = rb_next(node); + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } +} + +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping = NULL; + struct kvm_hyp_memcache *cache = mc; + u64 gfn = addr >> PAGE_SHIFT; + u64 pfn = phys >> PAGE_SHIFT; + int ret; + + if (size != PAGE_SIZE) + return -EINVAL; + + lockdep_assert_held_write(&kvm->mmu_lock); + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot); + if (ret) { + /* Is the gfn already mapped due to a racing vCPU? */ + if (ret == -EPERM) + return -EAGAIN; + } + + swap(mapping, cache->mapping); + mapping->gfn = gfn; + mapping->pfn = pfn; + WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings)); + + return ret; +} + +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } + + return ret; +} + +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + } + + return ret; +} + +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE); + + return 0; +} + +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + bool young = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, + mkold); + + return young; +} + +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags) +{ + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); +} + +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); +} + +void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + WARN_ON_ONCE(1); +} + +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, bool force_pte) +{ + WARN_ON_ONCE(1); + return NULL; +} + +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 456102bc0b55..6c5950b9ceac 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -24,6 +24,7 @@ static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { @@ -327,48 +328,25 @@ u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } -/** - * kvm_pmu_enable_counter_mask - enable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENSET register - * - * Call perf_event_enable to start counting the perf event - */ -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { - int i; - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(pmc); return; + } - for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; - - if (!(val & BIT(i))) - continue; - - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + perf_event_enable(pmc->perf_event); + if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) + kvm_debug("fail to enable perf event\n"); +} - if (!pmc->perf_event) { - kvm_pmu_create_perf_event(pmc); - } else { - perf_event_enable(pmc->perf_event); - if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) - kvm_debug("fail to enable perf event\n"); - } - } +static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) + perf_event_disable(pmc->perf_event); } -/** - * kvm_pmu_disable_counter_mask - disable selected PMU counters - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMCNTENCLR register - * - * Call perf_event_disable to stop counting the perf event - */ -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; @@ -376,16 +354,18 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) return; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { - struct kvm_pmc *pmc; + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; - pmc = kvm_vcpu_idx_to_pmc(vcpu, i); - - if (pmc->perf_event) - perf_event_disable(pmc->perf_event); + if (kvm_pmu_counter_is_enabled(pmc)) + kvm_pmc_enable_perf_event(pmc); + else + kvm_pmc_disable_perf_event(pmc); } + + kvm_vcpu_pmu_restore_guest(vcpu); } /* @@ -626,27 +606,28 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; + /* Request a reload of the PMU to enable/disable affected counters */ + if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + /* The reset bits don't indicate any state, and shouldn't be saved. */ __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); - if (val & ARMV8_PMU_PMCR_E) { - kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } else { - kvm_pmu_disable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); - } - if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_accessible_counter_mask(vcpu); - mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); + /* + * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply + * to the 'guest' range of counters and never the 'hyp' range. + */ + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & + ~kvm_pmu_hyp_counter_mask(vcpu) & + ~BIT(ARMV8_PMU_CYCLE_IDX); + for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } - kvm_vcpu_pmu_restore_guest(vcpu); } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) @@ -910,11 +891,11 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); - kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; + + kvm_pmu_reprogram_counter_mask(vcpu, mask); } int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 470524b31951..803e11b0dc8f 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -85,7 +85,7 @@ static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu_set_flag(vcpu, GUEST_HAS_SVE); + set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); } /* @@ -211,10 +211,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_reset_sve(vcpu); } - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) - kvm_vcpu_enable_ptrauth(vcpu); - if (vcpu_el1_is_32bit(vcpu)) pstate = VCPU_RESET_PSTATE_SVC; else if (vcpu_has_nv(vcpu)) diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 3ace5b75813b..af5eec681127 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -19,6 +19,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> #include <asm/stacktrace/nvhe.h> static struct stack_info stackinfo_get_overflow(void) @@ -50,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); unsigned long low = (unsigned long)stacktrace_info->stack_base; - unsigned long high = low + PAGE_SIZE; + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -60,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void) static struct stack_info stackinfo_get_hyp_kern_va(void) { - unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); - unsigned long high = low + PAGE_SIZE; + unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base); + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state, */ static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) { - unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0); unsigned long hyp_offset = (unsigned long)arg; /* Mask tags and convert to kern addr */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 83c6b4a07ef5..82430c1e1dd0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -570,17 +570,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 oslsr; - if (!p->is_write) return read_from_write_only(vcpu, p, r); - /* Forward the OSLK bit to OSLSR */ - oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~OSLSR_EL1_OSLK; - if (p->regval & OSLAR_EL1_OSLK) - oslsr |= OSLSR_EL1_OSLK; - - __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr; + kvm_debug_handle_oslar(vcpu, p->regval); return true; } @@ -621,43 +614,13 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, } } -/* - * We want to avoid world-switching all the DBG registers all the - * time: - * - * - If we've touched any debug register, it is likely that we're - * going to touch more of them. It then makes sense to disable the - * traps and start doing the save/restore dance - * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is - * then mandatory to save/restore the registers, as the guest - * depends on them. - * - * For this, we use a DIRTY bit, indicating the guest has modified the - * debug registers, used as follow: - * - * On guest entry: - * - If the dirty bit is set (because we're coming back from trapping), - * disable the traps, save host registers, restore guest registers. - * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), - * set the dirty bit, disable the traps, save host registers, - * restore guest registers. - * - Otherwise, enable the traps - * - * On guest exit: - * - If the dirty bit is set, save guest registers, restore host - * registers and clear the dirty bit. This ensure that the host can - * now use the debug registers. - */ static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { access_rw(vcpu, p, r); - if (p->is_write) - vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_trap_reg(__func__, r->reg, p->is_write, p->regval); + kvm_debug_set_guest_ownership(vcpu); return true; } @@ -666,9 +629,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits - * - * All writes will set the DEBUG_DIRTY flag to ensure the hyp code - * switches between host and guest values in future. */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -683,8 +643,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - - vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -698,152 +656,79 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu, p->regval = (*dbg_reg & mask) >> shift; } -static bool trap_bvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - - return true; -} - -static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; - return 0; -} + struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; -static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - return 0; + switch (rd->Op2) { + case 0b100: + return &dbg->dbg_bvr[rd->CRm]; + case 0b101: + return &dbg->dbg_bcr[rd->CRm]; + case 0b110: + return &dbg->dbg_wvr[rd->CRm]; + case 0b111: + return &dbg->dbg_wcr[rd->CRm]; + default: + KVM_BUG_ON(1, vcpu->kvm); + return NULL; + } } -static u64 reset_bvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; - return rd->val; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static bool trap_bcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; + if (!reg) + return false; if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); + reg_to_dbg(vcpu, p, rd, reg); else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); + dbg_to_reg(vcpu, p, rd, reg); + kvm_debug_set_guest_ownership(vcpu); return true; } -static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; - return 0; -} - -static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - return 0; -} - -static u64 reset_bcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); + u64 *reg = demux_wb_reg(vcpu, rd); - return true; -} - -static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; - return 0; -} + if (!reg) + return -EINVAL; -static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; + *reg = val; return 0; } -static u64 reset_wvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); + u64 *reg = demux_wb_reg(vcpu, rd); - return true; -} + if (!reg) + return -EINVAL; -static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; + *val = *reg; return 0; } -static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) +static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - return 0; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static u64 reset_wcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + /* + * Bail early if we couldn't find storage for the register, the + * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever + * being run. + */ + if (!reg) + return 0; + + *reg = rd->val; return rd->val; } @@ -1208,16 +1093,14 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; - if (r->Op2 & 0x1) { + if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; - kvm_pmu_enable_counter_mask(vcpu, val); - kvm_vcpu_pmu_restore_guest(vcpu); - } else { + else /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; - kvm_pmu_disable_counter_mask(vcpu, val); - } + + kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } @@ -1352,13 +1235,17 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ - trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ - trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ - trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ - trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg } #define PMU_SYS_REG(name) \ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ @@ -1412,26 +1299,146 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, switch (reg) { case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; + + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); return undef_access(vcpu, p, r); @@ -1445,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static bool access_hv_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!vcpu_el2_e2h_is_set(vcpu)) + return undef_access(vcpu, p, r); + + return access_arch_timer(vcpu, p, r); +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -1601,9 +1618,13 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; + case SYS_ID_AA64ISAR3_EL1: + val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX; + break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; @@ -1806,6 +1827,9 @@ static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + /* Hide BRBE from guests */ + val &= ~ID_AA64DFR0_EL1_BRBE_MASK; + return val; } @@ -2450,6 +2474,26 @@ static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, s1pie_visibility); } +static bool access_mdcr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!access_rw(vcpu, p, r)) + return false; + + /* + * Request a reload of the PMU to enable/disable the counters affected + * by HPME. + */ + if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME) + kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); + + return true; +} + + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2608,7 +2652,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | ID_AA64ISAR2_EL1_APA3 | ID_AA64ISAR2_EL1_GPA3)), - ID_UNALLOCATED(6,3), + ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | + ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), ID_UNALLOCATED(6,6), @@ -2618,7 +2663,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_TGRAN4_2 | ID_AA64MMFR0_EL1_TGRAN64_2 | - ID_AA64MMFR0_EL1_TGRAN16_2)), + ID_AA64MMFR0_EL1_TGRAN16_2 | + ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | ID_AA64MMFR1_EL1_TWED | @@ -2901,11 +2947,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, + /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), @@ -2982,7 +3034,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), - EL2_REG(MDCR_EL2, access_rw, reset_val, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), @@ -3057,9 +3109,24 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), + + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, + EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, + EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; @@ -3571,18 +3638,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, * None of the other registers share their location, so treat them as * if they were 64bit. */ -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ - /* DBGWCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } - -#define DBGBXVR(n) \ - { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n } +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ + trap_dbg_wb_reg, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } + +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ + trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external @@ -3879,9 +3948,11 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, @@ -4396,6 +4467,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) reset_vcpu_ftr_id_reg(vcpu, r); else r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + (void)__vcpu_sys_reg(vcpu, r->reg); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); @@ -4972,6 +5046,14 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) { + kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR); + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ | + HFGITR_EL2_nBRBIALL); + } + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock); @@ -4999,7 +5081,7 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) } if (vcpu_has_nv(vcpu)) { - int ret = kvm_init_nv_sysregs(kvm); + int ret = kvm_init_nv_sysregs(vcpu); if (ret) return ret; } diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 064a58c19f48..f85415db7713 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->vcpu_pc, __entry->r0, __entry->imm) ); -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - /* * The dreg32 name is a leftover from a distant past. This will really * output a 64bit value... @@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32, TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - __array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - TRACE_EVENT(kvm_handle_sys_reg, TP_PROTO(unsigned long hsr), TP_ARGS(hsr), diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index f4c4494645c3..fb96802799c6 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -608,12 +608,22 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, lockdep_assert_held(&its->its_lock); vgic_get_irq_kref(irq); + old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); + + /* + * Put the reference taken on @irq if the store fails. Intentionally do + * not return the error as the translation cache is best effort. + */ + if (xa_is_err(old)) { + vgic_put_irq(kvm, irq); + return; + } + /* * We could have raced with another CPU caching the same * translation behind our back, ensure we don't leak a * reference if that is the case. */ - old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); if (old) vgic_put_irq(kvm, old); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index f267bc2486a1..d7233ab982d0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -6,6 +6,7 @@ #include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> @@ -663,9 +664,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; - kvm_info("GICv4%s support %sabled\n", + kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", - gicv4_enable ? "en" : "dis"); + str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; @@ -734,7 +735,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -746,7 +748,8 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) |