diff options
Diffstat (limited to 'arch/x86')
48 files changed, 2852 insertions, 1541 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 5187fcf4b610..68ad4f923664 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -9,8 +9,7 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise * to make a definition optional, but in this case the default will * be __static_call_return0. */ @@ -85,7 +84,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP_OPTIONAL(hwapic_irr_update) KVM_X86_OP_OPTIONAL(hwapic_isr_update) -KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP_OPTIONAL(load_eoi_exitmap) KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) @@ -103,7 +101,6 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP(sched_in) KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) @@ -139,6 +136,9 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index f852b13aeefe..9159bf1a4730 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -9,8 +9,7 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. + * a NULL definition. */ KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8ca74e7678f..950a03e0181e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -121,6 +121,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -159,7 +160,6 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 -#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 @@ -533,12 +533,16 @@ struct kvm_pmc { }; /* More counters may conflict with other existing Architectural MSRs */ -#define KVM_INTEL_PMC_MAX_GENERIC 8 -#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) -#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) -#define KVM_PMC_MAX_FIXED 3 -#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) -#define KVM_AMD_PMC_MAX_GENERIC 6 +#define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b)) +#define KVM_MAX_NR_INTEL_GP_COUNTERS 8 +#define KVM_MAX_NR_AMD_GP_COUNTERS 6 +#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ + KVM_MAX_NR_AMD_GP_COUNTERS) + +#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 +#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 +#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ + KVM_MAX_NR_AMD_FIXED_COUTNERS) struct kvm_pmu { u8 version; @@ -546,16 +550,16 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_mask; + u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_mask; - u64 global_status_mask; + u64 global_ctrl_rsvd; + u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; - struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; + struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS]; + struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS]; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be @@ -571,9 +575,9 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; - u64 pebs_enable_mask; + u64 pebs_enable_rsvd; u64 pebs_data_cfg; - u64 pebs_data_cfg_mask; + u64 pebs_data_cfg_rsvd; /* * If a guest counter is cross-mapped to host counter with different @@ -604,18 +608,12 @@ enum { KVM_DEBUGREG_WONT_EXIT = 2, }; -struct kvm_mtrr_range { - u64 base; - u64 mask; - struct list_head node; -}; - struct kvm_mtrr { - struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; - mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + u64 var[KVM_NR_VAR_MTRR * 2]; + u64 fixed_64k; + u64 fixed_16k[2]; + u64 fixed_4k[8]; u64 deftype; - - struct list_head head; }; /* Hyper-V SynIC timer */ @@ -1207,7 +1205,7 @@ enum kvm_apicv_inhibit { * APIC acceleration is disabled by a module parameter * and/or not supported in hardware. */ - APICV_INHIBIT_REASON_DISABLE, + APICV_INHIBIT_REASON_DISABLED, /* * APIC acceleration is inhibited because AutoEOI feature is @@ -1277,8 +1275,27 @@ enum kvm_apicv_inhibit { * mapping between logical ID and vCPU. */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + + NR_APICV_INHIBIT_REASONS, }; +#define __APICV_INHIBIT_REASON(reason) \ + { BIT(APICV_INHIBIT_REASON_##reason), #reason } + +#define APICV_INHIBIT_REASONS \ + __APICV_INHIBIT_REASON(DISABLED), \ + __APICV_INHIBIT_REASON(HYPERV), \ + __APICV_INHIBIT_REASON(ABSENT), \ + __APICV_INHIBIT_REASON(BLOCKIRQ), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ + __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ + __APICV_INHIBIT_REASON(NESTED), \ + __APICV_INHIBIT_REASON(IRQWIN), \ + __APICV_INHIBIT_REASON(PIT_REINJ), \ + __APICV_INHIBIT_REASON(SEV), \ + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) + struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; @@ -1364,6 +1381,7 @@ struct kvm_arch { u32 default_tsc_khz; bool user_set_tsc; + u64 apic_bus_cycle_ns; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1708,13 +1726,11 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(int isr); - bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); @@ -1749,8 +1765,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); - /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero * value indicates CPU dirty logging is unsupported or disabled. @@ -1812,6 +1826,9 @@ struct kvm_x86_ops { gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); }; struct kvm_x86_nested_ops { @@ -1819,7 +1836,7 @@ struct kvm_x86_nested_ops { bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); - bool (*has_events)(struct kvm_vcpu *vcpu); + bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@ -1853,11 +1870,13 @@ struct kvm_arch_async_pf { }; extern u32 __read_mostly kvm_nr_uret_msrs; -extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; extern struct kvm_x86_ops kvm_x86_ops; +#define kvm_x86_call(func) static_call(kvm_x86_##func) +#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) + #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP @@ -1881,7 +1900,7 @@ void kvm_arch_free_vm(struct kvm *kvm); static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && - !static_call(kvm_x86_flush_remote_tlbs)(kvm)) + !kvm_x86_call(flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; @@ -1894,7 +1913,7 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, if (!kvm_x86_ops.flush_remote_tlbs_range) return -EOPNOTSUPP; - return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); + return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages); } #endif /* CONFIG_HYPERV */ @@ -1939,6 +1958,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -2292,12 +2312,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_blocking)(vcpu); + kvm_x86_call(vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); + kvm_x86_call(vcpu_unblocking)(vcpu); } static inline int kvm_cpu_get_apicid(int mps_cpu) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index e90d403f2068..98726c2b04f8 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -59,6 +59,14 @@ #define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 #define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) +/* Preferred GHCB GPA Request */ +#define GHCB_MSR_PREF_GPA_REQ 0x010 +#define GHCB_MSR_GPA_VALUE_POS 12 +#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) + +#define GHCB_MSR_PREF_GPA_RESP 0x011 +#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff + /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 #define GHCB_MSR_REG_GPA_REQ_VAL(v) \ @@ -93,11 +101,17 @@ enum psc_op { /* GHCBData[11:0] */ \ GHCB_MSR_PSC_REQ) +#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12) +#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52) + #define GHCB_MSR_PSC_RESP 0x015 #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +/* Set highest bit as a generic error response */ +#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP) + /* GHCB Run at VMPL Request/Response */ #define GHCB_MSR_VMPL_REQ 0x016 #define GHCB_MSR_VMPL_REQ_LEVEL(v) \ @@ -129,8 +143,19 @@ enum psc_op { * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which * is a local stack variable in set_pages_state(). Do not increase this value * without evaluating the impact to stack usage. + * + * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value + * is needed, such as when processing GHCB requests on the hypervisor side. */ #define VMGEXIT_PSC_MAX_ENTRY 64 +#define VMGEXIT_PSC_MAX_COUNT 253 + +#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32) +#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1) +#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2) + +#define VMGEXIT_PSC_OP_PRIVATE 1 +#define VMGEXIT_PSC_OP_SHARED 2 struct psc_hdr { u16 cur_entry; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ac5886ce252e..79bbe2be900e 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMUPDATE detected 4K page and 2MB page overlap. */ #define RMPUPDATE_FAIL_OVERLAP 4 +/* PSMASH failed due to concurrent access by another CPU */ +#define PSMASH_FAIL_INUSE 3 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 #define RMP_PG_SIZE_2M 1 @@ -116,6 +119,54 @@ struct snp_req_data { unsigned int data_npages; }; +#define MAX_AUTHTAG_LEN 32 + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[4000]; +} __packed; + struct sev_guest_platform_data { u64 secrets_gpa; }; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 728c98175b9c..f0dea3750ca9 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -285,7 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) + +#define SVM_SEV_FEAT_INT_INJ_MODES \ + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ + SVM_SEV_FEAT_ALTERNATE_INJECTION) struct vmcb_seg { u16 selector; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529..bf57a824f722 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -697,6 +698,11 @@ enum sev_cmd_id { /* Second time is the charm; improved versions of the above ioctls. */ KVM_SEV_INIT2, + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data { __u32 pad2; }; +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) @@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SW_PROTECTED_VM 1 #define KVM_X86_SEV_VM 2 #define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fec95a770270..4287a8071a3a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -44,6 +44,7 @@ config KVM select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_WERROR if WERROR help Support hosting fully virtualized guest machines using hardware @@ -139,6 +140,9 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM + select KVM_GENERIC_PRIVATE_MEM + select HAVE_KVM_GMEM_PREPARE + select HAVE_KVM_GMEM_INVALIDATE help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f2f2be5d1141..2617be544480 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -335,6 +335,18 @@ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) #endif } +static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, 0); + if (!entry) + return false; + + return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) || + is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx); +} + static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -388,7 +400,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.cpuid_nent)); /* Invoke the vendor callback only after the above state is updated. */ - static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); + kvm_x86_call(vcpu_after_set_cpuid)(vcpu); /* * Except for the MMU, which needs to do its thing any vendor specific diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 23dbb9eb277c..41697cca354e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -102,24 +102,6 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, *reg &= ~__feature_bit(x86_feature); } -static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0); - return best && - (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || - is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); -} - -static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0); - return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); -} - static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) { return vcpu->arch.is_amd_compatible; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c8cc578646d0..e72aed25d721 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2354,50 +2354,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) ss->avl = 0; } -static bool vendor_intel(struct x86_emulate_ctxt *ctxt) -{ - u32 eax, ebx, ecx, edx; - - eax = ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); - return is_guest_vendor_intel(ebx, ecx, edx); -} - -static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) -{ - const struct x86_emulate_ops *ops = ctxt->ops; - u32 eax, ebx, ecx, edx; - - /* - * syscall should always be enabled in longmode - so only become - * vendor specific (cpuid) if other modes are active... - */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return true; - - eax = 0x00000000; - ecx = 0x00000000; - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); - /* - * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a - * 64bit guest with a 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD response - CPUs of - * AMD can't behave like Intel. - */ - if (is_guest_vendor_intel(ebx, ecx, edx)) - return false; - - if (is_guest_vendor_amd(ebx, ecx, edx) || - is_guest_vendor_hygon(ebx, ecx, edx)) - return true; - - /* - * default: (not Intel, not AMD, not Hygon), apply Intel's - * stricter rules... - */ - return false; -} - static int em_syscall(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; @@ -2411,7 +2367,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); - if (!(em_syscall_is_enabled(ctxt))) + /* + * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas + * AMD allows SYSCALL in any flavor of protected mode. Note, it's + * infeasible to emulate Intel behavior when running on AMD hardware, + * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD + * for KVM to trap-and-emulate, unlike emulating AMD on Intel. + */ + if (ctxt->mode != X86EMUL_MODE_PROT64 && + ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); @@ -2471,11 +2435,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) return emulate_gp(ctxt, 0); /* - * Not recognized on AMD in compat mode (but is recognized in legacy - * mode). + * Intel's architecture allows SYSENTER in compatibility mode, but AMD + * does not. Note, AMD does allow SYSENTER in legacy protected mode. */ - if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) - && !vendor_intel(ctxt)) + if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) && + !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); /* sysenter/sysexit have not been tested in 64bit mode. */ @@ -2647,7 +2611,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) * manner when ECX is zero due to REP-string optimizations. */ #ifdef CONFIG_X86_64 - if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + u32 eax, ebx, ecx, edx; + + if (ctxt->ad_bytes != 4) + return; + + eax = ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); + if (!is_guest_vendor_intel(ebx, ecx, edx)) return; *reg_write(ctxt, VCPU_REGS_RCX) = 0; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 8a47f8541eab..4f0a94346d00 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1417,7 +1417,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } /* vmcall/vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + kvm_x86_call(patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ @@ -1737,7 +1737,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: - data = APIC_BUS_FREQUENCY; + data = div64_u64(1000000000ULL, + vcpu->kvm->arch.apic_bus_cycle_ns); break; default: kvm_pr_unimpl_rdmsr(vcpu, msr); @@ -1985,7 +1986,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) */ gva = entries[i] & PAGE_MASK; for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); ++vcpu->stat.tlb_flush; } @@ -2526,7 +2527,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { + if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ad9ca8a60144..3d7eb11d0e45 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -157,7 +157,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); - static_call_cond(kvm_x86_migrate_timers)(vcpu); + kvm_x86_call(migrate_timers)(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index c2d7cfe82d00..76d46b2f41dd 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_setup_default_irq_routing(struct kvm *kvm); -int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 68f3f6c26046..8136695f7b96 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -static const struct kvm_irq_routing_entry empty_routing[] = {}; - -int kvm_setup_empty_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, empty_routing, 0, 0); -} - void kvm_arch_post_irq_routing_update(struct kvm *kvm) { if (!irqchip_split(kvm)) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 75eae9c4998a..b1eb46e26b2e 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -98,7 +98,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg return 0; if (!kvm_register_is_available(vcpu, reg)) - static_call(kvm_x86_cache_reg)(vcpu, reg); + kvm_x86_call(cache_reg)(vcpu, reg); return vcpu->arch.regs[reg]; } @@ -138,7 +138,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -153,7 +153,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -175,7 +175,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } @@ -190,7 +190,7 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 29ea4313e1bb..55a18e2f2dcd 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -223,6 +223,7 @@ struct x86_emulate_ops { bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); + bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index acd7d48100a1..a7172ba59ad2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -738,8 +738,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, - apic_find_highest_irr(apic)); + kvm_x86_call(hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -765,7 +765,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vec); + kvm_x86_call(hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -810,7 +810,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -946,7 +946,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (kvm_x86_ops.sync_pir_to_irr) - highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); + highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) @@ -1338,8 +1338,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode, - trig_mode, vector); + kvm_x86_call(deliver_interrupt)(apic, delivery_mode, + trig_mode, vector); break; case APIC_DM_REMRD: @@ -1557,7 +1557,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); - return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count)); + return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns * + apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) @@ -1973,7 +1974,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) { - return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; + return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns * + (u64)apic->divide_count; } static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) @@ -2103,7 +2105,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - static_call(kvm_x86_cancel_hv_timer)(apic->vcpu); + kvm_x86_call(cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } @@ -2120,7 +2122,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!ktimer->tscdeadline) return false; - if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) + if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; @@ -2575,7 +2577,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); - static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + kvm_x86_call(set_virtual_apic_mode)(vcpu); } apic->base_address = apic->vcpu->arch.apic_base & @@ -2685,7 +2687,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) u64 msr_val; int i; - static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_x86_call(apicv_pre_state_restore)(vcpu); if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; @@ -2740,9 +2742,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (apic->apicv_active) { - static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(-1); + kvm_x86_call(apicv_post_state_restore)(vcpu); + kvm_x86_call(hwapic_irr_update)(vcpu, -1); + kvm_x86_call(hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2838,7 +2840,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; if (kvm_x86_ops.alloc_apic_backing_page) - apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu); + apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu); else apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { @@ -3017,7 +3019,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; - static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_x86_call(apicv_pre_state_restore)(vcpu); kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ @@ -3044,9 +3046,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); if (apic->apicv_active) { - static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(apicv_post_state_restore)(vcpu); + kvm_x86_call(hwapic_irr_update)(vcpu, + apic_find_highest_irr(apic)); + kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -3334,7 +3337,8 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector); + kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, + sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a69e706b9080..7ef8ae73e82d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -16,8 +16,7 @@ #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 -#define APIC_BUS_CYCLE_NS 1 -#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +#define APIC_BUS_CYCLE_NS_DEFAULT 1 #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul @@ -236,7 +235,7 @@ static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu) static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu) { return !is_smm(vcpu) && - !static_call(kvm_x86_apic_init_signal_blocked)(vcpu); + !kvm_x86_call(apic_init_signal_blocked)(vcpu); } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2e454316f2a2..4341e0e28571 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -57,12 +57,6 @@ static __always_inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -extern u8 __read_mostly shadow_phys_bits; - static inline gfn_t kvm_mmu_max_gfn(void) { /* @@ -76,30 +70,11 @@ static inline gfn_t kvm_mmu_max_gfn(void) * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR * disallows such SPTEs entirely and simplifies the TDP MMU. */ - int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52; + int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52; return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; } -static inline u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); @@ -163,8 +138,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, - vcpu->arch.mmu->root_role.level); + kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa, + vcpu->arch.mmu->root_role.level); } static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, @@ -199,7 +174,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, { /* strip nested paging fault error codes */ unsigned int pfec = access; - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); /* * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. @@ -246,14 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma); - -static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) -{ - return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); -} - -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); +bool kvm_mmu_may_ignore_guest_pat(void); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d74bdef68c1..901be9e420a4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -722,7 +722,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) if (sp->role.passthrough) return sp->gfn; - if (!sp->role.direct) + if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); @@ -736,7 +736,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp_has_gptes(sp)) + if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* @@ -757,7 +757,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { - if (sp_has_gptes(sp)) { + if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } @@ -1700,8 +1700,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); - if (!sp->role.direct) - free_page((unsigned long)sp->shadowed_translation); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -2203,7 +2202,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); - if (!role.direct) + if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -3308,7 +3307,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3320,6 +3319,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) return false; /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. + * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + + /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are @@ -3419,7 +3438,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 *sptep; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3428,7 +3447,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 new_spte; if (tdp_mmu_enabled) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3438,7 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) - spte = REMOVED_SPTE; + spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; @@ -4271,7 +4290,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) @@ -4291,6 +4319,25 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return req_max_level; +} + static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4308,9 +4355,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); return RET_PF_CONTINUE; } @@ -4561,7 +4608,10 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); - /* Ensure the above sanity check also covers KVM-defined flags. */ + /* + * Restrict KVM-defined flags to bits 63:32 so that it's impossible for + * them to conflict with #PF error codes, which are limited to 32 bits. + */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; @@ -4621,38 +4671,23 @@ out_unlock: } #endif -bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma) +bool kvm_mmu_may_ignore_guest_pat(void) { /* - * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the - * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is - * to honor the memtype from the guest's MTRRs so that guest accesses - * to memory that is DMA'd aren't cached against the guest's wishes. - * - * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, - * e.g. KVM will force UC memtype for host MMIO. + * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does + * not support self-snoop (or is affected by an erratum), and the VM + * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to + * honor the memtype from the guest's PAT so that guest accesses to + * memory that is DMA'd aren't cached against the guest's wishes. As a + * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, + * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE + * bits in response to non-coherent device (un)registration. */ - return vm_has_noncoherent_dma && shadow_memtype_mask; + return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - /* - * If the guest's MTRRs may be used to compute the "real" memtype, - * restrict the mapping level to ensure KVM uses a consistent memtype - * across the entire mapping. - */ - if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) { - for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = gfn_round_for_level(fault->gfn, - fault->max_level); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; - } - } - #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) return kvm_tdp_mmu_page_fault(vcpu, fault); @@ -4661,6 +4696,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, + u8 *level) +{ + int r; + + /* + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. + */ + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; + } +} + +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 end; + int r; + + /* + * reload is efficient when called repeatedly, so we can do it on + * every iteration. + */ + kvm_mmu_reload(vcpu); + + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + if (r < 0) + return r; + + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); +} + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -4988,7 +5096,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, static inline u64 reserved_hpa_bits(void) { - return rsvd_bits(shadow_phys_bits, 63); + return rsvd_bits(kvm_host.maxphyaddr, 63); } /* @@ -5633,7 +5741,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } @@ -5886,14 +5994,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } if (r == RET_PF_INVALID) { + vcpu->stat.pf_taken++; + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, - &emulation_type); + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; + if (r != RET_PF_EMULATE) return 1; @@ -5995,7 +6113,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); + kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) @@ -6787,6 +6905,7 @@ restart: return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -6917,7 +7036,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink, list_for_each_entry(kvm, &vm_list, vm_list) { int idx; - LIST_HEAD(invalid_list); /* * Never scan more than sc->nr_to_scan VM instances. diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ce2fcd19ba6b..1721d97743e9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } - /* - * Async #PF "faults", a.k.a. prefetch faults, are not faults from the - * guest perspective and have already been counted at the time of the - * original fault. - */ - if (!prefetch) - vcpu->stat.pf_taken++; - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else @@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; - /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; return r; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index d3dbcf382ed2..69941cebb3a8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -911,7 +911,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gpa_t pte_gpa; gfn_t gfn; - if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE)) + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || + !sp->shadowed_translation)) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a5e014d7bc62..d4527965e48c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -u8 __read_mostly shadow_phys_bits; +static u8 __init kvm_get_host_maxphyaddr(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, + * when reasoning about CPU behavior with respect to MAXPHYADDR. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} void __init kvm_mmu_spte_module_init(void) { @@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void) * will change when the vendor module is (re)loaded. */ allow_mmio_caching = enable_mmio_caching; + + kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); } static u64 generation_mmio_spte_mask(u64 gen) @@ -190,8 +210,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= PT_PAGE_SIZE_MASK; if (shadow_memtype_mask) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -271,18 +291,12 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, - int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { - u64 child_spte; - - if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) - return 0; + u64 child_spte = huge_spte; - if (WARN_ON_ONCE(!is_large_pte(huge_spte))) - return 0; - - child_spte = huge_spte; + KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being @@ -383,7 +397,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || - WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) @@ -441,8 +455,6 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; - shadow_phys_bits = kvm_get_shadow_phys_bits(); - /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is @@ -494,7 +506,7 @@ void kvm_mmu_reset_all_pte_masks(void) * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ - if (shadow_phys_bits < 52) + if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 52fa004a1fbc..ef793c459b05 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -202,7 +202,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * If a thread running without exclusive control of the MMU lock must perform a - * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * @@ -212,14 +212,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Removed SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); +static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); -static inline bool is_removed_spte(u64 spte) +static inline bool is_frozen_spte(u64 spte) { - return spte == REMOVED_SPTE; + return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 36539c1b36cd..c7dc49ee7388 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -365,8 +365,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); + if (!is_frozen_spte(old_spte)) break; cpu_relax(); } @@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for - * write. Marking the SPTE as a removed SPTE is not + * write. Marking the SPTE as a frozen SPTE is not * strictly necessary for the same reason, but using - * the remove SPTE value keeps the shared/exclusive + * the frozen SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() - * call below to hardcode the new value to REMOVED_SPTE. + * call below to hardcode the new value to FROZEN_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a @@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, - REMOVED_SPTE, level); + FROZEN_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, REMOVED_SPTE, level, shared); + old_spte, FROZEN_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE or removed SPTE, + * If this change does not involve a MMIO SPTE or frozen SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && !is_mmio_spte(kvm, new_spte) && - !is_removed_spte(new_spte))) + !is_frozen_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" - "a temporary removed SPTE.\n" + "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -530,7 +530,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } -static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) +static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, + u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); @@ -540,7 +541,7 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and @@ -572,9 +573,9 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ -static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { int ret; @@ -590,8 +591,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { int ret; @@ -603,26 +604,26 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * in its place before the TLBs are flushed. * * Delay processing of the zapped SPTE until after TLBs are flushed and - * the REMOVED_SPTE is replaced (see below). + * the FROZEN_SPTE is replaced (see below). */ - ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); if (ret) return ret; kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* - * No other thread can overwrite the removed SPTE as they must either + * No other thread can overwrite the frozen SPTE as they must either * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. Use the raw write helper to + * overwrite the special frozen SPTE value. Use the raw write helper to * avoid an unnecessary check on volatile bits. */ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); /* * Process the zapped SPTE after flushing TLBs, and after replacing - * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the REMOVED_SPTE and reduces contention on the child + * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are + * blocked by the FROZEN_SPTE and reduces contention on the child * SPTEs. */ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, @@ -652,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, /* * No thread should be using this function to set SPTEs to or from the - * temporary removed SPTE value. + * temporary frozen SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the - * use of the removed SPTE should not be necessary. + * use of the frozen SPTE should not be necessary. */ - WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); @@ -1126,7 +1127,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ - if (is_removed_spte(iter.old_spte)) + if (is_frozen_spte(iter.old_spte)) goto retry; if (iter.level == fault->goal_level) @@ -1339,17 +1340,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, return spte_set; } -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void) { struct kvm_mmu_page *sp; - gfp |= __GFP_ZERO; - - sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT); if (!sp) return NULL; - sp->spt = (void *)__get_free_page(gfp); + sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sp->spt) { kmem_cache_free(mmu_page_header_cache, sp); return NULL; @@ -1358,47 +1357,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) return sp; } -static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, - struct tdp_iter *iter, - bool shared) -{ - struct kvm_mmu_page *sp; - - kvm_lockdep_assert_mmu_lock_held(kvm, shared); - - /* - * Since we are allocating while under the MMU lock we have to be - * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct - * reclaim and to avoid making any filesystem callbacks (which can end - * up invoking KVM MMU notifiers, resulting in a deadlock). - * - * If this allocation fails we drop the lock and retry with reclaim - * allowed. - */ - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); - if (sp) - return sp; - - rcu_read_unlock(); - - if (shared) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); - - iter->yielded = true; - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); - - if (shared) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); - - rcu_read_lock(); - - return sp; -} - /* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) @@ -1445,7 +1403,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, { struct kvm_mmu_page *sp = NULL; struct tdp_iter iter; - int ret = 0; rcu_read_lock(); @@ -1469,17 +1426,31 @@ retry: continue; if (!sp) { - sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + sp = tdp_mmu_alloc_sp_for_split(); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + if (!sp) { - ret = -ENOMEM; trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte, - iter.level, ret); - break; + iter.level, -ENOMEM); + return -ENOMEM; } - if (iter.yielded) - continue; + rcu_read_lock(); + + iter.yielded = true; + continue; } tdp_mmu_init_child_sp(sp, &iter); @@ -1500,7 +1471,7 @@ retry: if (sp) tdp_mmu_free_sp(sp); - return ret; + return 0; } @@ -1801,12 +1772,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, * * WARNING: This function is only intended to be called during fast_page_fault. */ -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 58b55e61bd33..1b74e058a81c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a67c28a56417..05490b9d8a43 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -19,33 +19,21 @@ #include <asm/mtrr.h> #include "cpuid.h" -#include "mmu.h" -#define IA32_MTRR_DEF_TYPE_E (1ULL << 11) -#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) -#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) - -static bool is_mtrr_base_msr(unsigned int msr) -{ - /* MTRR base MSRs use even numbers, masks use odd numbers. */ - return !(msr & 0x1); -} - -static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, - unsigned int msr) +static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { - int index = (msr - MTRRphysBase_MSR(0)) / 2; - - return &vcpu->arch.mtrr_state.var_ranges[index]; -} + int index; -static bool msr_mtrr_valid(unsigned msr) -{ switch (msr) { case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): + index = msr - MTRRphysBase_MSR(0); + return &vcpu->arch.mtrr_state.var[index]; case MSR_MTRRfix64K_00000: + return &vcpu->arch.mtrr_state.fixed_64k; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: + index = msr - MSR_MTRRfix16K_80000; + return &vcpu->arch.mtrr_state.fixed_16k[index]; case MSR_MTRRfix4K_C0000: case MSR_MTRRfix4K_C8000: case MSR_MTRRfix4K_D0000: @@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_E8000: case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: + index = msr - MSR_MTRRfix4K_C0000; + return &vcpu->arch.mtrr_state.fixed_4k[index]; case MSR_MTRRdefType: - return true; + return &vcpu->arch.mtrr_state.deftype; + default: + break; } - return false; + return NULL; } static bool valid_mtrr_type(unsigned t) @@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) int i; u64 mask; - if (!msr_mtrr_valid(msr)) - return false; - if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; @@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= MTRRphysBase_MSR(0) && - msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); + if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)))) + return false; mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { @@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!valid_mtrr_type(data & 0xff)) return false; mask |= 0xf00; - } else + } else { /* MTRR mask */ mask |= 0x7ff; - - return (data & mask) == 0; -} - -static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) -{ - return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E); -} - -static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state) -{ - return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE); -} - -static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) -{ - return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; -} - -static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) -{ - /* - * Intel SDM 11.11.2.2: all MTRRs are disabled when - * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC - * memory type is applied to all of physical memory. - * - * However, virtual machines can be run with CPUID such that - * there are no MTRRs. In that case, the firmware will never - * enable MTRRs and it is obviously undesirable to run the - * guest entirely with UC memory and we use WB. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR)) - return MTRR_TYPE_UNCACHABLE; - else - return MTRR_TYPE_WRBACK; -} - -/* -* Three terms are used in the following code: -* - segment, it indicates the address segments covered by fixed MTRRs. -* - unit, it corresponds to the MSR entry in the segment. -* - range, a range is covered in one memory cache type. -*/ -struct fixed_mtrr_segment { - u64 start; - u64 end; - - int range_shift; - - /* the start position in kvm_mtrr.fixed_ranges[]. */ - int range_start; -}; - -static struct fixed_mtrr_segment fixed_seg_table[] = { - /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */ - { - .start = 0x0, - .end = 0x80000, - .range_shift = 16, /* 64K */ - .range_start = 0, - }, - - /* - * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units, - * 16K fixed mtrr. - */ - { - .start = 0x80000, - .end = 0xc0000, - .range_shift = 14, /* 16K */ - .range_start = 8, - }, - - /* - * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units, - * 4K fixed mtrr. - */ - { - .start = 0xc0000, - .end = 0x100000, - .range_shift = 12, /* 12K */ - .range_start = 24, - } -}; - -/* - * The size of unit is covered in one MSR, one MSR entry contains - * 8 ranges so that unit size is always 8 * 2^range_shift. - */ -static u64 fixed_mtrr_seg_unit_size(int seg) -{ - return 8 << fixed_seg_table[seg].range_shift; -} - -static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) -{ - switch (msr) { - case MSR_MTRRfix64K_00000: - *seg = 0; - *unit = 0; - break; - case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: - *seg = 1; - *unit = array_index_nospec( - msr - MSR_MTRRfix16K_80000, - MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); - break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: - *seg = 2; - *unit = array_index_nospec( - msr - MSR_MTRRfix4K_C0000, - MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); - break; - default: - return false; } - return true; -} - -static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - u64 unit_size = fixed_mtrr_seg_unit_size(seg); - - *start = mtrr_seg->start + unit * unit_size; - *end = *start + unit_size; - WARN_ON(*end > mtrr_seg->end); -} - -static int fixed_mtrr_seg_unit_range_index(int seg, int unit) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - - WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg) - > mtrr_seg->end); - - /* each unit has 8 ranges. */ - return mtrr_seg->range_start + 8 * unit; -} - -static int fixed_mtrr_seg_end_range_index(int seg) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - int n; - - n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift; - return mtrr_seg->range_start + n - 1; -} - -static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) -{ - int seg, unit; - - if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) - return false; - - fixed_mtrr_seg_unit_range(seg, unit, start, end); - return true; -} - -static int fixed_msr_to_range_index(u32 msr) -{ - int seg, unit; - - if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) - return -1; - - return fixed_mtrr_seg_unit_range_index(seg, unit); -} - -static int fixed_mtrr_addr_to_seg(u64 addr) -{ - struct fixed_mtrr_segment *mtrr_seg; - int seg, seg_num = ARRAY_SIZE(fixed_seg_table); - - for (seg = 0; seg < seg_num; seg++) { - mtrr_seg = &fixed_seg_table[seg]; - if (mtrr_seg->start <= addr && addr < mtrr_seg->end) - return seg; - } - - return -1; -} - -static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) -{ - struct fixed_mtrr_segment *mtrr_seg; - int index; - - mtrr_seg = &fixed_seg_table[seg]; - index = mtrr_seg->range_start; - index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift; - return index; -} - -static u64 fixed_mtrr_range_end_addr(int seg, int index) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - int pos = index - mtrr_seg->range_start; - - return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift); -} - -static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) -{ - u64 mask; - - *start = range->base & PAGE_MASK; - - mask = range->mask & PAGE_MASK; - - /* This cannot overflow because writing to the reserved bits of - * variable MTRRs causes a #GP. - */ - *end = (*start | ~mask) + 1; -} - -static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - gfn_t start, end; - - if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) - return; - - if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) - return; - - /* fixed MTRRs. */ - if (fixed_msr_to_range(msr, &start, &end)) { - if (!fixed_mtrr_is_enabled(mtrr_state)) - return; - } else if (msr == MSR_MTRRdefType) { - start = 0x0; - end = ~0ULL; - } else { - /* variable range MTRRs. */ - var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); - } - - kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); -} - -static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range) -{ - return (range->mask & (1 << 11)) != 0; -} - -static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct kvm_mtrr_range *tmp, *cur; - - cur = var_mtrr_msr_to_range(vcpu, msr); - - /* remove the entry if it's in the list. */ - if (var_mtrr_range_is_valid(cur)) - list_del(&cur->node); - - /* - * Set all illegal GPA bits in the mask, since those bits must - * implicitly be 0. The bits are then cleared when reading them. - */ - if (is_mtrr_base_msr(msr)) - cur->base = data; - else - cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); - - /* add it to the list if it's enabled. */ - if (var_mtrr_range_is_valid(cur)) { - list_for_each_entry(tmp, &mtrr_state->head, node) - if (cur->base >= tmp->base) - break; - list_add_tail(&cur->node, &tmp->node); - } + return (data & mask) == 0; } int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - int index; + u64 *mtrr; - if (!kvm_mtrr_valid(vcpu, msr, data)) + mtrr = find_mtrr(vcpu, msr); + if (!mtrr) return 1; - index = fixed_msr_to_range_index(msr); - if (index >= 0) - *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; - else if (msr == MSR_MTRRdefType) - vcpu->arch.mtrr_state.deftype = data; - else - set_var_mtrr_msr(vcpu, msr, data); + if (!kvm_mtrr_valid(vcpu, msr, data)) + return 1; - update_mtrr(vcpu, msr); + *mtrr = data; return 0; } int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { - int index; + u64 *mtrr; /* MSR_MTRRcap is a readonly MSR. */ if (msr == MSR_MTRRcap) { @@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } - if (!msr_mtrr_valid(msr)) + mtrr = find_mtrr(vcpu, msr); + if (!mtrr) return 1; - index = fixed_msr_to_range_index(msr); - if (index >= 0) { - *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - } else if (msr == MSR_MTRRdefType) { - *pdata = vcpu->arch.mtrr_state.deftype; - } else { - /* Variable MTRRs */ - if (is_mtrr_base_msr(msr)) - *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; - else - *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; - - *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); - } - + *pdata = *mtrr; return 0; } - -void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) -{ - INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); -} - -struct mtrr_iter { - /* input fields. */ - struct kvm_mtrr *mtrr_state; - u64 start; - u64 end; - - /* output fields. */ - int mem_type; - /* mtrr is completely disabled? */ - bool mtrr_disabled; - /* [start, end) is not fully covered in MTRRs? */ - bool partial_map; - - /* private fields. */ - union { - /* used for fixed MTRRs. */ - struct { - int index; - int seg; - }; - - /* used for var MTRRs. */ - struct { - struct kvm_mtrr_range *range; - /* max address has been covered in var MTRRs. */ - u64 start_max; - }; - }; - - bool fixed; -}; - -static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter) -{ - int seg, index; - - if (!fixed_mtrr_is_enabled(iter->mtrr_state)) - return false; - - seg = fixed_mtrr_addr_to_seg(iter->start); - if (seg < 0) - return false; - - iter->fixed = true; - index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg); - iter->index = index; - iter->seg = seg; - return true; -} - -static bool match_var_range(struct mtrr_iter *iter, - struct kvm_mtrr_range *range) -{ - u64 start, end; - - var_mtrr_range(range, &start, &end); - if (!(start >= iter->end || end <= iter->start)) { - iter->range = range; - - /* - * the function is called when we do kvm_mtrr.head walking. - * Range has the minimum base address which interleaves - * [looker->start_max, looker->end). - */ - iter->partial_map |= iter->start_max < start; - - /* update the max address has been covered. */ - iter->start_max = max(iter->start_max, end); - return true; - } - - return false; -} - -static void __mtrr_lookup_var_next(struct mtrr_iter *iter) -{ - struct kvm_mtrr *mtrr_state = iter->mtrr_state; - - list_for_each_entry_continue(iter->range, &mtrr_state->head, node) - if (match_var_range(iter, iter->range)) - return; - - iter->range = NULL; - iter->partial_map |= iter->start_max < iter->end; -} - -static void mtrr_lookup_var_start(struct mtrr_iter *iter) -{ - struct kvm_mtrr *mtrr_state = iter->mtrr_state; - - iter->fixed = false; - iter->start_max = iter->start; - iter->range = NULL; - iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); - - __mtrr_lookup_var_next(iter); -} - -static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) -{ - /* terminate the lookup. */ - if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) { - iter->fixed = false; - iter->range = NULL; - return; - } - - iter->index++; - - /* have looked up for all fixed MTRRs. */ - if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) - return mtrr_lookup_var_start(iter); - - /* switch to next segment. */ - if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) - iter->seg++; -} - -static void mtrr_lookup_var_next(struct mtrr_iter *iter) -{ - __mtrr_lookup_var_next(iter); -} - -static void mtrr_lookup_start(struct mtrr_iter *iter) -{ - if (!mtrr_is_enabled(iter->mtrr_state)) { - iter->mtrr_disabled = true; - return; - } - - if (!mtrr_lookup_fixed_start(iter)) - mtrr_lookup_var_start(iter); -} - -static void mtrr_lookup_init(struct mtrr_iter *iter, - struct kvm_mtrr *mtrr_state, u64 start, u64 end) -{ - iter->mtrr_state = mtrr_state; - iter->start = start; - iter->end = end; - iter->mtrr_disabled = false; - iter->partial_map = false; - iter->fixed = false; - iter->range = NULL; - - mtrr_lookup_start(iter); -} - -static bool mtrr_lookup_okay(struct mtrr_iter *iter) -{ - if (iter->fixed) { - iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index]; - return true; - } - - if (iter->range) { - iter->mem_type = iter->range->base & 0xff; - return true; - } - - return false; -} - -static void mtrr_lookup_next(struct mtrr_iter *iter) -{ - if (iter->fixed) - mtrr_lookup_fixed_next(iter); - else - mtrr_lookup_var_next(iter); -} - -#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \ - for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \ - mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_)) - -u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct mtrr_iter iter; - u64 start, end; - int type = -1; - const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) - | (1 << MTRR_TYPE_WRTHROUGH); - - start = gfn_to_gpa(gfn); - end = start + PAGE_SIZE; - - mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { - int curr_type = iter.mem_type; - - /* - * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR - * Precedences. - */ - - if (type == -1) { - type = curr_type; - continue; - } - - /* - * If two or more variable memory ranges match and the - * memory types are identical, then that memory type is - * used. - */ - if (type == curr_type) - continue; - - /* - * If two or more variable memory ranges match and one of - * the memory types is UC, the UC memory type used. - */ - if (curr_type == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - /* - * If two or more variable memory ranges match and the - * memory types are WT and WB, the WT memory type is used. - */ - if (((1 << type) & wt_wb_mask) && - ((1 << curr_type) & wt_wb_mask)) { - type = MTRR_TYPE_WRTHROUGH; - continue; - } - - /* - * For overlaps not defined by the above rules, processor - * behavior is undefined. - */ - - /* We use WB for this undefined behavior. :( */ - return MTRR_TYPE_WRBACK; - } - - if (iter.mtrr_disabled) - return mtrr_disabled_type(vcpu); - - /* not contained in any MTRRs. */ - if (type == -1) - return mtrr_default_type(mtrr_state); - - /* - * We just check one page, partially covered by MTRRs is - * impossible. - */ - WARN_ON(iter.partial_map); - - return type; -} -EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); - -bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, - int page_num) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct mtrr_iter iter; - u64 start, end; - int type = -1; - - start = gfn_to_gpa(gfn); - end = gfn_to_gpa(gfn + page_num); - mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { - if (type == -1) { - type = iter.mem_type; - continue; - } - - if (type != iter.mem_type) - return false; - } - - if (iter.mtrr_disabled) - return true; - - if (!iter.partial_map) - return true; - - if (type == -1) - return true; - - return type == mtrr_default_type(mtrr_state); -} diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a593b03c9aed..47a46283c866 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -34,16 +34,16 @@ EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_D, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_X, NULL), /* Instruction-Accurate PDIR (PDIR++) */ - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; /* Precise Distribution (PDist) */ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; @@ -69,7 +69,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters @@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, attr.sample_period = get_sample_period(pmc, pmc->counter); if ((attr.config & HSW_IN_TX_CHECKPOINTED) && - guest_cpuid_is_intel(pmc->vcpu)) { + (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) { /* * HSW_IN_TX_CHECKPOINTED is not supported with nonzero * period. Just clear the sample period so at least @@ -469,11 +469,11 @@ static int reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - if (fixed_ctr_ctrl & 0x1) + if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL) eventsel |= ARCH_PERFMON_EVENTSEL_OS; - if (fixed_ctr_ctrl & 0x2) + if (fixed_ctr_ctrl & INTEL_FIXED_0_USER) eventsel |= ARCH_PERFMON_EVENTSEL_USR; - if (fixed_ctr_ctrl & 0x8) + if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI) eventsel |= ARCH_PERFMON_EVENTSEL_INT; new_config = (u64)fixed_ctr_ctrl; } @@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* - * Unused perf_events are only released if the corresponding MSRs - * weren't accessed during the last vCPU time slice. kvm_arch_sched_in - * triggers KVM_REQ_PMU if cleanup is needed. + * Release unused perf_events if the corresponding guest MSRs weren't + * accessed during the last vCPU time slice (need_cleanup is set when + * the vCPU is scheduled back in). */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); @@ -542,7 +542,7 @@ int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) if (!kvm_pmu_ops.check_rdpmc_early) return 0; - return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx); + return kvm_pmu_call(check_rdpmc_early)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -591,12 +591,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); + pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) && - (static_call(kvm_x86_get_cpl)(vcpu) != 0) && + (kvm_x86_call(get_cpl)(vcpu) != 0) && kvm_is_cr0_bit_set(vcpu, X86_CR0_PE)) return 1; @@ -607,7 +607,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); + kvm_pmu_call(deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } @@ -622,14 +622,14 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: break; } - return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); + return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) || + kvm_pmu_call(is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); + struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -654,7 +654,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; default: - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + return kvm_pmu_call(get_msr)(vcpu, msr_info); } return 0; @@ -681,13 +681,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) break; - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; pmu->global_status = data; break; case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: - data &= ~pmu->global_ctrl_mask; + data &= ~pmu->global_ctrl_rsvd; fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) @@ -704,7 +704,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in * GLOBAL_STATUS, and so the set of reserved bits is the same. */ - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; fallthrough; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -713,7 +713,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + return kvm_pmu_call(set_msr)(vcpu, msr_info); } return 0; @@ -740,7 +740,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; - static_call_cond(kvm_x86_pmu_reset)(vcpu); + kvm_pmu_call(reset)(vcpu); } @@ -768,17 +768,17 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; + pmu->global_ctrl_rsvd = ~0ull; + pmu->global_status_rsvd = ~0ull; + pmu->fixed_ctr_ctrl_rsvd = ~0ull; + pmu->pebs_enable_rsvd = ~0ull; + pmu->pebs_data_cfg_rsvd = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); if (!vcpu->kvm->arch.enable_pmu) return; - static_call(kvm_x86_pmu_refresh)(vcpu); + kvm_pmu_call(refresh)(vcpu); /* * At RESET, both Intel and AMD CPUs set all enable bits for general @@ -796,7 +796,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - static_call(kvm_x86_pmu_init)(vcpu); + kvm_pmu_call(init)(vcpu); kvm_pmu_refresh(vcpu); } @@ -818,7 +818,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); } - static_call_cond(kvm_x86_pmu_cleanup)(vcpu); + kvm_pmu_call(cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -846,8 +846,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - select_os = config & 0x1; - select_user = config & 0x2; + select_os = config & INTEL_FIXED_0_KERNEL; + select_user = config & INTEL_FIXED_0_USER; } /* @@ -857,7 +857,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) if (select_os == select_user) return select_os; - return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; + return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os : + select_user; } void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 4d52b0b539ba..ad89d0bd6005 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -14,7 +14,8 @@ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) +#define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) #define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 @@ -129,7 +130,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { - return !(pmu->global_ctrl_mask & data); + return !(pmu->global_ctrl_rsvd & data); } /* returns general purpose PMC with the specified MSR. Note that it can be @@ -170,7 +171,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } @@ -217,7 +219,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_PMC_MAX_FIXED); + KVM_MAX_NR_FIXED_COUNTERS); kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index d06d43d8d2aa..00e3c27d2a87 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -200,11 +200,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); smram->gdtr.base = dt.address; smram->gdtr.limit = dt.size; - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); smram->idtr.base = dt.address; smram->idtr.limit = dt.size; @@ -220,7 +220,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, smram->smm_revision = 0x00020000; smram->smbase = vcpu->arch.smbase; - smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); } #ifdef CONFIG_X86_64 @@ -250,13 +250,13 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR); - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); smram->idtr.limit = dt.size; smram->idtr.base = dt.address; enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR); - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); smram->gdtr.limit = dt.size; smram->gdtr.base = dt.address; @@ -267,7 +267,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); - smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); } #endif @@ -297,7 +297,7 @@ void enter_smm(struct kvm_vcpu *vcpu) * Kill the VM in the unlikely case of failure, because the VM * can be in undefined state in this case. */ - if (static_call(kvm_x86_enter_smm)(vcpu, &smram)) + if (kvm_x86_call(enter_smm)(vcpu, &smram)) goto error; kvm_smm_changed(vcpu, true); @@ -305,24 +305,24 @@ void enter_smm(struct kvm_vcpu *vcpu) if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram))) goto error; - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) + if (kvm_x86_call(get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); + kvm_x86_call(set_nmi_mask)(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); + kvm_x86_call(set_cr0)(vcpu, cr0); - static_call(kvm_x86_set_cr4)(vcpu, 0); + kvm_x86_call(set_cr4)(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1))) goto error; @@ -354,7 +354,7 @@ void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - if (static_call(kvm_x86_set_efer)(vcpu, 0)) + if (kvm_x86_call(set_efer)(vcpu, 0)) goto error; #endif @@ -479,11 +479,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, dt.address = smstate->gdtr.base; dt.size = smstate->gdtr.limit; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); dt.address = smstate->idtr.base; dt.size = smstate->idtr.limit; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES); rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS); @@ -501,7 +501,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, if (r != X86EMUL_CONTINUE) return r; - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; return r; @@ -535,13 +535,13 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, dt.size = smstate->idtr.limit; dt.address = smstate->idtr.base; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR); dt.size = smstate->gdtr.limit; dt.address = smstate->gdtr.base; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); if (r != X86EMUL_CONTINUE) @@ -554,7 +554,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; return X86EMUL_CONTINUE; @@ -576,7 +576,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0) - static_call(kvm_x86_set_nmi_mask)(vcpu, false); + kvm_x86_call(set_nmi_mask)(vcpu, false); kvm_smm_changed(vcpu, false); @@ -628,7 +628,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) * state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (static_call(kvm_x86_leave_smm)(vcpu, &smram)) + if (kvm_x86_call(leave_smm)(vcpu, &smram)) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 55b9a6d96bcf..6f704c1037e5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm) if (svm->nested.initialized) return 0; - vmcb02_page = snp_safe_alloc_page(&svm->vcpu); + vmcb02_page = snp_safe_alloc_page(); if (!vmcb02_page) return -ENOMEM; svm->nested.vmcb02.ptr = page_address(vmcb02_page); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index dfcc38bd97d3..22d5a65b410c 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); - pmu->global_status_mask = pmu->global_ctrl_mask; + pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; @@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE); - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { + for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 95095a233a45..a16c873b3232 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,12 +19,14 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> +#include <uapi/linux/sev-guest.h> #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/sev.h> #include "mmu.h" #include "x86.h" @@ -37,7 +39,7 @@ #define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL -#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) /* enable/disable SEV support */ static bool sev_enabled = true; @@ -47,6 +49,10 @@ module_param_named(sev, sev_enabled, bool, 0444); static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + /* enable/disable SEV-ES DebugSwap support */ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); @@ -56,6 +62,23 @@ static u64 sev_supported_vmsa_features; #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 + static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -66,6 +89,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -92,12 +117,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? "-SNP" : "", ret, error); return ret; } @@ -233,6 +263,53 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. + */ +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int fw_err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { + snp_leak_pages(pfn, 1); + return -EIO; + } + + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -250,6 +327,78 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } +/* + * This sets up bounce buffers/firmware pages to handle SNP Guest Request + * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB + * 2.0 specification for more details. + * + * Technically, when an SNP Guest Request is issued, the guest will provide its + * own request/response pages, which could in theory be passed along directly + * to firmware rather than using bounce pages. However, these pages would need + * special care: + * + * - Both pages are from shared guest memory, so they need to be protected + * from migration/etc. occurring while firmware reads/writes to them. At a + * minimum, this requires elevating the ref counts and potentially needing + * an explicit pinning of the memory. This places additional restrictions + * on what type of memory backends userspace can use for shared guest + * memory since there is some reliance on using refcounted pages. + * + * - The response page needs to be switched to Firmware-owned[1] state + * before the firmware can write to it, which can lead to potential + * host RMP #PFs if the guest is misbehaved and hands the host a + * guest page that KVM might write to for other reasons (e.g. virtio + * buffers/etc.). + * + * Both of these issues can be avoided completely by using separately-allocated + * bounce pages for both the request/response pages and passing those to + * firmware instead. So that's what is being set up here. + * + * Guest requests rely on message sequence numbers to ensure requests are + * issued to firmware in the order the guest issues them, so concurrent guest + * requests generally shouldn't happen. But a misbehaved guest could issue + * concurrent guest requests in theory, so a mutex is used to serialize + * access to the bounce buffers. + * + * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more + * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" + * in the APM for details on the related RMP restrictions. + */ +static int snp_guest_req_init(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct page *req_page; + + req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req_page) + return -ENOMEM; + + sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sev->guest_resp_buf) { + __free_page(req_page); + return -EIO; + } + + sev->guest_req_buf = page_address(req_page); + mutex_init(&sev->guest_req_mutex); + + return 0; +} + +static void snp_guest_req_cleanup(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + if (sev->guest_resp_buf) + snp_free_firmware_page(sev->guest_resp_buf); + + if (sev->guest_req_buf) + __free_page(virt_to_page(sev->guest_req_buf)); + + sev->guest_req_buf = NULL; + sev->guest_resp_buf = NULL; +} + static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_init *data, unsigned long vm_type) @@ -288,6 +437,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (sev->es_active && !sev->ghcb_version) sev->ghcb_version = GHCB_VERSION_DEFAULT; + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + ret = sev_asid_new(sev); if (ret) goto e_no_asid; @@ -297,6 +449,10 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (ret) goto e_free; + /* This needs to happen after SEV/SNP firmware initialization. */ + if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm)) + goto e_free; + INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); sev->need_init = false; @@ -348,7 +504,8 @@ static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && - kvm->arch.vm_type != KVM_X86_SEV_ES_VM) + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) return -EINVAL; if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) @@ -1999,6 +2156,410 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) } } +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. + */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned; + int level; + + if (!kvm_mem_is_private(kvm, gfn)) { + pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n", + __func__, gfn); + ret = -EINVAL; + goto err; + } + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = -EINVAL; + goto err; + } + + if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); + if (ret) + goto err; + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(kvm, pfn + i) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(kvm, pfn); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + + return ret; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2022,6 +2583,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -2086,6 +2656,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -2281,6 +2860,31 @@ e_source_fput: return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -2322,7 +2926,19 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + snp_guest_req_cleanup(kvm); + + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. + */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } @@ -2336,11 +2952,16 @@ void __init sev_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SEV_ES); kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } } void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2427,6 +3048,7 @@ void __init sev_hardware_setup(void) sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: if (boot_cpu_has(X86_FEATURE_SEV)) @@ -2439,9 +3061,15 @@ out: pr_info("SEV-ES %s (ASIDs %u - %u)\n", sev_es_supported ? "enabled" : "disabled", min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + sev_snp_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + sev_snp_enabled = sev_snp_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; @@ -2520,7 +3148,13 @@ do_wbinvd: void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); @@ -2535,11 +3169,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); __free_page(virt_to_page(svm->sev_es.vmsa)); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); } @@ -2735,6 +3382,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: @@ -2742,6 +3396,18 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_TERM_REQUEST: break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + if (!sev_snp_guest(vcpu->kvm) || + !PAGE_ALIGNED(control->exit_info_1) || + !PAGE_ALIGNED(control->exit_info_2) || + control->exit_info_1 == control->exit_info_2) + goto vmgexit_err; + break; default: reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; @@ -2929,6 +3595,534 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + unreachable(); +} + +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { + gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + struct kvm_memory_slot *slot; + kvm_pfn_t pfn; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return -EINVAL; + + /* + * The new VMSA will be private memory guest memory, so + * retrieve the PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + return -EINVAL; + + /* + * From this point forward, the VMSA will always be a + * guest-mapped page rather than the initial one allocated + * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa + * could be free'd and cleaned up here, but that involves + * cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. + * Deferring that also allows the existing logic for SEV-ES + * VMSAs to be re-used with minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + /* + * gmem pages aren't currently migratable, but if this ever + * changes then care should be taken to ensure + * svm->sev_es.vmsa is pinned through some other means. + */ + kvm_release_pfn_clean(pfn); + } + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + return 0; +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + mutex_lock(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + goto unlock; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + ret = __sev_snp_update_protected_guest_state(vcpu); + if (ret) + vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + +unlock: + mutex_unlock(&svm->sev_es.snp_vmsa_mutex); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + bool kick; + int ret; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + ret = 0; + + target_svm = to_svm(target_vcpu); + + /* + * The target vCPU is valid, so the vCPU will be kicked unless the + * request is for CREATE_ON_INIT. For any errors at this stage, the + * kick will place the vCPU in an non-runnable state. + */ + kick = true; + + mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); + + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* Interrupt injection mode shouldn't change for AP creation */ + if (request < SVM_VMGEXIT_AP_DESTROY) { + u64 sev_features; + + sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; + sev_features ^= sev->vmsa_features; + + if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX]); + ret = -EINVAL; + goto out; + } + } + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + kick = false; + fallthrough; + case SVM_VMGEXIT_AP_CREATE: + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. + */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + ret = -EINVAL; + goto out; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + ret = -EINVAL; + break; + } + +out: + if (kick) { + kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + kvm_vcpu_kick(target_vcpu); + } + + mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + + return ret; +} + +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + sev_ret_code fw_err = 0; + int ret; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + mutex_lock(&sev->guest_req_mutex); + + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.req_paddr = __psp_pa(sev->guest_req_buf); + data.res_paddr = __psp_pa(sev->guest_resp_buf); + + /* + * Firmware failures are propagated on to guest, but any other failure + * condition along the way should be reported to userspace. E.g. if + * the PSP is dead and commands are timing out. + */ + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); + if (ret && !fw_err) + goto out_unlock; + + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err)); + + ret = 1; /* resume guest */ + +out_unlock: + mutex_unlock(&sev->guest_req_mutex); + return ret; +} + +static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm *kvm = svm->vcpu.kvm; + u8 msg_type; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), + &msg_type, 1)) + return -EIO; + + /* + * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for + * additional certificate data to be provided alongside the attestation + * report via the guest-provided data pages indicated by RAX/RBX. The + * certificate data is optional and requires additional KVM enablement + * to provide an interface for userspace to provide it, but KVM still + * needs to be able to handle extended guest requests either way. So + * provide a stub implementation that will always return an empty + * certificate table in the guest-provided data pages. + */ + if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 data_npages; + gpa_t data_gpa; + + if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) + goto request_invalid; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!PAGE_ALIGNED(data_gpa)) + goto request_invalid; + + /* + * As per GHCB spec (see "SNP Extended Guest Request"), the + * certificate table is terminated by 24-bytes of zeroes. + */ + if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) + return -EIO; + } + + return snp_handle_guest_req(svm, req_gpa, resp_gpa); + +request_invalid: + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3008,6 +4202,38 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -3020,12 +4246,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -3036,6 +4257,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -3071,6 +4300,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; @@ -3145,6 +4381,28 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.ndata = 1; vcpu->run->system_event.data[0] = control->ghcb_gpa; break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -3238,7 +4496,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa) + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ @@ -3310,6 +4568,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) @@ -3331,9 +4591,9 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed * by common SVM code). */ - hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + hostsa->xcr0 = kvm_host.xcr0; hostsa->pkru = read_pkru(); - hostsa->xss = host_xss; + hostsa->xss = kvm_host.xss; /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by @@ -3389,13 +4649,13 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } } -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { unsigned long pfn; struct page *p; if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) - return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); /* * Allocate an SNP-safe page to workaround the SNP erratum where @@ -3406,7 +4666,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) * Allocate one extra page, choose a page which is not * 2MB-aligned, and free the other. */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); if (!p) return NULL; @@ -3420,3 +4680,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. + */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + put_page(pfn_to_page(pfn)); +} + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc || !assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c95d3900fe56..c115d26844f7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -53,6 +53,7 @@ #include "svm_onhyperv.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -570,6 +571,11 @@ static void __svm_write_tsc_multiplier(u64 multiplier) __this_cpu_write(current_tsc_ratio, multiplier); } +static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return page_address(sd->save_area) + 0x400; +} + static inline void kvm_cpu_svm_disable(void) { uint64_t efer; @@ -674,12 +680,9 @@ static int svm_hardware_enable(void) * TSC_AUX field now to avoid a RDMSR on every vCPU run. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - struct sev_es_save_area *hostsa; u32 __maybe_unused msr_hi; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi); + rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } return 0; @@ -704,7 +707,7 @@ static int svm_cpu_init(int cpu) int ret = -ENOMEM; memset(sd, 0, sizeof(struct svm_cpu_data)); - sd->save_area = snp_safe_alloc_page(NULL); + sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); if (!sd->save_area) return ret; @@ -1202,7 +1205,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (guest_cpuid_is_intel(vcpu)) { + if (guest_cpuid_is_intel_compatible(vcpu)) { /* * We must intercept SYSENTER_EIP and SYSENTER_ESP * accesses because the processor only stores 32 bits. @@ -1404,6 +1407,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) @@ -1427,7 +1433,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb01_page = snp_safe_alloc_page(vcpu); + vmcb01_page = snp_safe_alloc_page(); if (!vmcb01_page) goto out; @@ -1436,7 +1442,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. */ - vmsa_page = snp_safe_alloc_page(vcpu); + vmsa_page = snp_safe_alloc_page(); if (!vmsa_page) goto error_free_vmcb_page; } @@ -1501,11 +1507,6 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) -{ - return page_address(sd->save_area) + 0x400; -} - static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1551,6 +1552,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); + if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; @@ -2050,6 +2054,7 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; @@ -2063,11 +2068,19 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) error_code &= ~PFERR_SYNTHETIC_MASK; + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) @@ -2875,12 +2888,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SYSENTER_EIP: msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_eip_hi << 32; break; case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: @@ -3107,11 +3120,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * 32 bit part of these msrs to support Intel's * implementation of SYSENTER/SYSEXIT. */ - svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_IA32_SYSENTER_ESP: svm->vmcb01.ptr->save.sysenter_esp = (u32)data; - svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: /* @@ -4372,11 +4385,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); /* - * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that + * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). */ - if (!guest_cpuid_is_intel(vcpu)) + if (!guest_cpuid_is_intel_compatible(vcpu)) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); @@ -4595,12 +4608,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - static void svm_setup_mce(struct kvm_vcpu *vcpu) { /* [63:9] are reserved. */ @@ -4937,8 +4944,11 @@ static int svm_vm_init(struct kvm *kvm) if (type != KVM_X86_DEFAULT_VM && type != KVM_X86_SW_PROTECTED_VM) { - kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM); + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); } if (!pause_filter_count || !pause_filter_thresh) @@ -4955,7 +4965,7 @@ static int svm_vm_init(struct kvm *kvm) static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) { - struct page *page = snp_safe_alloc_page(vcpu); + struct page *page = snp_safe_alloc_page(); if (!page) return NULL; @@ -5060,8 +5070,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .sched_in = svm_sched_in, - .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, @@ -5095,6 +5103,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, + .private_max_mapping_level = sev_private_max_mapping_level, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0f1472690b59..76107c7d0595 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -94,6 +94,10 @@ struct kvm_sev_info { struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + void *snp_context; /* SNP guest context page */ + void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ + void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ + struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; struct kvm_svm { @@ -209,6 +213,18 @@ struct vcpu_sev_es_state { u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + /* SNP Page-State-Change buffer entries currently being processed */ + u16 psc_idx; + u16 psc_inflight; + bool psc_2m; + + u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */ + gpa_t snp_vmsa_gpa; + bool snp_ap_waiting_for_reset; + bool snp_has_guest_vmsa; }; struct vcpu_svm { @@ -350,6 +366,23 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) #endif } +static __always_inline bool sev_snp_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && + !WARN_ON_ONCE(!sev_es_guest(kvm)); +#else + return false; +#endif +} + +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -638,7 +671,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ #define AVIC_REQUIRED_APICV_INHIBITS \ ( \ - BIT(APICV_INHIBIT_REASON_DISABLE) | \ + BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ BIT(APICV_INHIBIT_REASON_HYPERV) | \ BIT(APICV_INHIBIT_REASON_NESTED) | \ @@ -696,7 +729,13 @@ void sev_guest_memory_reclaimed(struct kvm *kvm); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); /* These symbols are used in common code and are stubbed below. */ -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); + +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp); +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); +} + void sev_free_vcpu(struct kvm_vcpu *vcpu); void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); @@ -705,9 +744,20 @@ void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); #else -static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) { - return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); +static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) +{ + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); +} + +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} @@ -718,6 +768,18 @@ static inline void sev_hardware_unsetup(void) {} static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 +static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} +static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} +static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + return 0; +} +static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} +static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return 0; +} + #endif /* vmenter.S */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e19fed438a67..d3aeffd6ae75 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -314,12 +314,12 @@ TRACE_EVENT(name, \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - static_call(kvm_x86_get_exit_info)(vcpu, \ - &__entry->exit_reason, \ - &__entry->info1, \ - &__entry->info2, \ - &__entry->intr_info, \ - &__entry->error_code); \ + kvm_x86_call(get_exit_info)(vcpu, \ + &__entry->exit_reason, \ + &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ ), \ \ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ @@ -828,7 +828,8 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS); + __entry->csbase = kvm_x86_call(get_segment_base)(vcpu, + VCPU_SREG_CS); __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr - vcpu->arch.emulate_ctxt->fetch.data; __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; @@ -1375,6 +1376,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +#define kvm_print_apicv_inhibit_reasons(inhibits) \ + (inhibits), (inhibits) ? " " : "", \ + (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : "" + TRACE_EVENT(kvm_apicv_inhibit_changed, TP_PROTO(int reason, bool set, unsigned long inhibits), TP_ARGS(reason, set, inhibits), @@ -1391,9 +1396,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed, __entry->inhibits = inhibits; ), - TP_printk("%s reason=%u, inhibits=0x%lx", + TP_printk("%s reason=%u, inhibits=0x%lx%s%s", __entry->set ? "set" : "cleared", - __entry->reason, __entry->inhibits) + __entry->reason, + kvm_print_apicv_inhibit_reasons(__entry->inhibits)) ); TRACE_EVENT(kvm_apicv_accept_irq, @@ -1834,6 +1840,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); +/* + * Tracepoint for #NPFs due to RMP faults. + */ +TRACE_EVENT(kvm_rmp_fault, + TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code, + int rmp_level, int psmash_ret), + TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, gpa) + __field(u64, pfn) + __field(u64, error_code) + __field(int, rmp_level) + __field(int, psmash_ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->pfn = pfn; + __entry->error_code = error_code; + __entry->rmp_level = rmp_level; + __entry->psmash_ret = psmash_ret; + ), + + TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d", + __entry->vcpu_id, __entry->gpa, __entry->pfn, + __entry->error_code, __entry->rmp_level, __entry->psmash_ret) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index d4ed681785fd..0bf35ebe8a1b 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -8,7 +8,7 @@ #include "posted_intr.h" #define VMX_REQUIRED_APICV_INHIBITS \ - (BIT(APICV_INHIBIT_REASON_DISABLE)| \ + (BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ BIT(APICV_INHIBIT_REASON_HYPERV) | \ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ @@ -97,7 +97,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, @@ -122,8 +121,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .sched_in = vmx_sched_in, - .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 643935a0f70a..2392a7ef254d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -12,6 +12,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" @@ -2425,7 +2426,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); @@ -2438,7 +2439,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); - if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; @@ -3899,8 +3900,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; @@ -4031,10 +4032,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } -static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { - return nested_vmx_preemption_timer_pending(vcpu) || - to_vmx(vcpu)->nested.mtf_pending; + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic = vmx->nested.virtual_apic_map.hva; + int max_irr, vppr; + + if (nested_vmx_preemption_timer_pending(vcpu) || + vmx->nested.mtf_pending) + return true; + + /* + * Virtual Interrupt Delivery doesn't require manual injection. Either + * the interrupt is already in GUEST_RVI and will be recognized by CPU + * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move + * the interrupt from the PIR to RVI prior to entering the guest. + */ + if (for_injection) + return false; + + if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || + __vmx_interrupt_blocked(vcpu)) + return false; + + if (!vapic) + return false; + + vppr = *((u32 *)(vapic + APIC_PROCPRI)); + + max_irr = vmx_get_rvi(); + if ((max_irr & 0xf0) > (vppr & 0xf0)) + return true; + + if (vmx->nested.pi_pending && vmx->nested.pi_desc && + pi_test_on(vmx->nested.pi_desc)) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) + return true; + } + + return false; } /* @@ -4665,7 +4702,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmcs_read64(GUEST_IA32_EFER); if (cpu_has_load_ia32_efer()) - return host_efer; + return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) @@ -4676,7 +4713,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) if (efer_msr) return efer_msr->data; - return host_efer; + return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index be40474de6e4..83382a4d1d66 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -348,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (data & pmu->fixed_ctr_ctrl_mask) + if (data & pmu->fixed_ctr_ctrl_rsvd) return 1; if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; case MSR_IA32_PEBS_ENABLE: - if (data & pmu->pebs_enable_mask) + if (data & pmu->pebs_enable_rsvd) return 1; if (pmu->pebs_enable != data) { @@ -371,7 +371,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmu->ds_area = data; break; case MSR_PEBS_DATA_CFG: - if (data & pmu->pebs_data_cfg_mask) + if (data & pmu->pebs_data_cfg_rsvd) return 1; pmu->pebs_data_cfg = data; @@ -436,8 +436,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); - BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -448,6 +448,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) return eventsel; } +static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits); +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -456,8 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid10_eax eax; union cpuid10_edx edx; u64 perf_capabilities; - u64 counter_mask; - int i; + u64 counter_rsvd; memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); @@ -501,22 +508,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | + INTEL_FIXED_0_USER | + INTEL_FIXED_0_ENABLE_PMI); + + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); - pmu->global_ctrl_mask = counter_mask; + pmu->global_ctrl_rsvd = counter_rsvd; /* * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) * share reserved bit definitions. The kernel just happens to use * OVF_CTRL for the names. */ - pmu->global_status_mask = pmu->global_ctrl_mask + pmu->global_status_rsvd = pmu->global_ctrl_rsvd & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_status_mask &= + pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -544,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = counter_mask; + pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); - } - pmu->pebs_data_cfg_mask = ~0xff00000full; + pmu->pebs_data_cfg_rsvd = ~0xff00000full; + intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_mask = + pmu->pebs_enable_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); } } @@ -564,14 +570,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; @@ -731,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 6b2a0226257e..1715d2ab07be 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H + +#include <linux/find.h> #include <asm/posted_intr.h> void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); @@ -12,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); +static inline int pi_find_highest_vector(struct pi_desc *pi_desc) +{ + int vec; + + vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + return vec < 256 ? vec : -1; +} + #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 01936013428b..56fd150a6f24 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -188,12 +188,13 @@ struct __packed vmcs12 { }; /* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM + * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision + * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's + * to KVM's virtual CPU definition. * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. + * DO NOT change this value, as it will break save/restore compatibility with + * older KVM releases. */ #define VMCS12_REVISION 0x11e57ed0 @@ -206,7 +207,8 @@ struct __packed vmcs12 { #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* - * For save/restore compatibility, the vmcs12 field offsets must not change. + * For save/restore compatibility, the vmcs12 field offsets must not change, + * although appending fields and/or filling gaps is obviously allowed. */ #define CHECK_OFFSET(field, loc) \ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b3c83c06f826..f18c2d8c7476 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -74,6 +74,7 @@ #include "posted_intr.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -259,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } @@ -404,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) * and VM-Exit. */ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && - (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -1123,12 +1124,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) * atomically, since it's faster than switching it manually. */ if (cpu_has_load_ia32_efer() || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { + (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer, false); + guest_efer, kvm_host.efer, false); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -1141,7 +1142,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, MSR_EFER); guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + guest_efer |= kvm_host.efer & ignore_bits; vmx->guest_uret_msrs[i].data = guest_efer; vmx->guest_uret_msrs[i].mask = ~ignore_bits; @@ -1411,6 +1412,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1486,6 +1519,9 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); @@ -2525,17 +2561,15 @@ static bool cpu_has_sgx(void) */ static bool cpu_has_perf_global_ctrl_bug(void) { - if (boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ - case INTEL_FAM6_NEHALEM: /* AAP115 */ - case INTEL_FAM6_WESTMERE: /* AAT100 */ - case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ - case INTEL_FAM6_NEHALEM_EX: /* BA97 */ - return true; - default: - break; - } + switch (boot_cpu_data.x86_vfm) { + case INTEL_NEHALEM_EP: /* AAK155 */ + case INTEL_NEHALEM: /* AAP115 */ + case INTEL_WESTMERE: /* AAT100 */ + case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_NEHALEM_EX: /* BA97 */ + return true; + default: + break; } return false; @@ -2834,9 +2868,6 @@ int vmx_hardware_enable(void) return r; } - if (enable_ept) - ept_sync_global(); - return 0; } @@ -4108,26 +4139,6 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - void *vapic_page; - u32 vppr; - int rvi; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || - !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) - return false; - - rvi = vmx_get_rvi(); - - vapic_page = vmx->nested.virtual_apic_map.hva; - vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4357,7 +4368,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) } if (cpu_has_load_ia32_efer()) - vmcs_write64(HOST_IA32_EFER, host_efer); + vmcs_write64(HOST_IA32_EFER, kvm_host.efer); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -5052,14 +5063,19 @@ int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_nmi_blocked(vcpu); } +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return false; - return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + return __vmx_interrupt_blocked(vcpu); } int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -5897,38 +5913,6 @@ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) return 1; } -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old, ple_window, - ple_window_grow, - ple_window_max); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __shrink_ple_window(old, ple_window, - ple_window_shrink, - ple_window); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -6677,9 +6661,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) bool flush_l1d; /* - * Clear the per-vcpu flush bit, it gets set again - * either from vcpu_run() or from one of the unsafe - * VMEXIT handlers. + * Clear the per-vcpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator. */ flush_l1d = vcpu->arch.l1tf_flush_l1d; vcpu->arch.l1tf_flush_l1d = false; @@ -7665,39 +7650,25 @@ int vmx_vm_init(struct kvm *kvm) u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in - * memory aliases with conflicting memory types and sometimes MCEs. - * We have to be careful as to what are honored and when. - * - * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to - * UC. The effective memory type is UC or WC depending on guest PAT. - * This was historically the source of MCEs and we want to be - * conservative. - * - * When there is no need to deal with noncoherent DMA (e.g., no VT-d - * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The - * EPT memory type is set to WB. The effective memory type is forced - * WB. - * - * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The - * EPT memory type is used to emulate guest CD/MTRR. + /* + * Force UC for host MMIO regions, as allowing the guest to access MMIO + * with cacheable accesses will result in Machine Checks. */ - if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* + * Force WB and ignore guest PAT if the VM does NOT have a non-coherent + * device attached and the CPU doesn't support self-snoop. Letting the + * guest control memory types on Intel CPUs without self-snoop may + * result in unexpected behavior, and so KVM's (historical) ABI is to + * trust the guest to behave only as a last resort. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP) && + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; - if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; - else - return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) | - VMX_EPT_IPAT_BIT; - } - - return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -8179,12 +8150,6 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) } #endif -void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8396,18 +8361,16 @@ static void __init vmx_setup_me_spte_mask(void) u64 me_mask = 0; /* - * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use - * the former to avoid exposing shadow_phys_bits. - * * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to - * shadow_phys_bits. On MKTME and/or TDX capable systems, + * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, * boot_cpu_data.x86_phys_bits holds the actual physical address - * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR - * reported by CPUID. Those bits between are KeyID bits. + * w/o the KeyID bits, and kvm_host.maxphyaddr equals to + * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. */ - if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, - kvm_get_shadow_phys_bits() - 1); + kvm_host.maxphyaddr - 1); + /* * Unlike SME, host kernel doesn't support setting up any * MKTME KeyID on Intel platforms. No memory encryption @@ -8629,9 +8592,9 @@ static void __vmx_exit(void) static void vmx_exit(void) { kvm_exit(); + __vmx_exit(); kvm_x86_vendor_exit(); - __vmx_exit(); } module_exit(vmx_exit); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7b64e271a931..42498fa63abb 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -406,6 +406,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); @@ -727,7 +728,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) return true; return allow_smaller_maxphyaddr && - cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits(); + cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr; } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 502704596c83..ce3221cd1d01 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -46,10 +46,8 @@ bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); void vmx_migrate_timers(struct kvm_vcpu *vcpu); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); -bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void vmx_hwapic_isr_update(int max_isr); -bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu); int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); @@ -111,8 +109,6 @@ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); void vmx_write_tsc_offset(struct kvm_vcpu *vcpu); void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu); -void vmx_request_immediate_exit(struct kvm_vcpu *vcpu); -void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu); void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); #ifdef CONFIG_X86_64 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0763a0f72a06..af6c8cf6a37a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -100,6 +100,9 @@ struct kvm_caps kvm_caps __read_mostly; EXPORT_SYMBOL_GPL(kvm_caps); +struct kvm_host_values kvm_host __read_mostly; +EXPORT_SYMBOL_GPL(kvm_host); + #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) #define emul_to_vcpu(ctxt) \ @@ -220,21 +223,12 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) -u64 __read_mostly host_efer; -EXPORT_SYMBOL_GPL(host_efer); - bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); -u64 __read_mostly host_xss; -EXPORT_SYMBOL_GPL(host_xss); - -u64 __read_mostly host_arch_capabilities; -EXPORT_SYMBOL_GPL(host_arch_capabilities); - const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -308,8 +302,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -u64 __read_mostly host_xcr0; - static struct kmem_cache *x86_emulator_cache; /* @@ -833,7 +825,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) + if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -917,7 +909,7 @@ static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return false; - return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); + return kvm_x86_call(is_valid_cr0)(vcpu, cr0); } void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) @@ -954,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); - - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_mmu_honors_guest_mtrrs(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } EXPORT_SYMBOL_GPL(kvm_post_set_cr0); @@ -981,7 +968,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } @@ -995,7 +982,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; - static_call(kvm_x86_set_cr0)(vcpu, cr0); + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); @@ -1016,11 +1003,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != host_xcr0) + if (vcpu->arch.xcr0 != kvm_host.xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != host_xss) + vcpu->arch.ia32_xss != kvm_host.xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } @@ -1047,12 +1034,12 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, host_xss); + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrl(MSR_IA32_XSS, kvm_host.xss); } } @@ -1113,7 +1100,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + if (kvm_x86_call(get_cpl)(vcpu) != 0 || __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; @@ -1138,7 +1125,7 @@ EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { return __kvm_is_valid_cr4(vcpu, cr4) && - static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + kvm_x86_call(is_valid_cr4)(vcpu, cr4); } void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) @@ -1206,7 +1193,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - static_call(kvm_x86_set_cr4)(vcpu, cr4); + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); @@ -1345,7 +1332,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - static_call(kvm_x86_set_dr7)(vcpu, dr7); + kvm_x86_call(set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1461,10 +1448,10 @@ static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, - /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, @@ -1477,7 +1464,7 @@ static const u32 msrs_to_save_pmu[] = { MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, - /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, @@ -1619,7 +1606,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) static u64 kvm_get_arch_capabilities(void) { - u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; + u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1688,7 +1675,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return static_call(kvm_x86_get_msr_feature)(msr); + return kvm_x86_call(get_msr_feature)(msr); } return 0; } @@ -1762,7 +1749,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - r = static_call(kvm_x86_set_efer)(vcpu, efer); + r = kvm_x86_call(set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; @@ -1877,11 +1864,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * incomplete and conflicting architectural behavior. Current * AMD CPUs completely ignore bits 63:32, i.e. they aren't * reserved and always read as zeros. Enforce Intel's reserved - * bits check if and only if the guest CPU is Intel, and clear - * the bits in all other cases. This ensures cross-vendor - * migration will provide consistent behavior for the guest. + * bits check if the guest CPU is Intel compatible, otherwise + * clear the bits. This ensures cross-vendor migration will + * provide consistent behavior for the guest. */ - if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0) return 1; data = (u32)data; @@ -1892,7 +1879,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return static_call(kvm_x86_set_msr)(vcpu, &msr); + return kvm_x86_call(set_msr)(vcpu, &msr); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, @@ -1934,7 +1921,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = static_call(kvm_x86_get_msr)(vcpu, &msr); + ret = kvm_x86_call(get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -2002,7 +1989,7 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) static int complete_fast_msr_access(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); + return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error); } static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) @@ -2066,7 +2053,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read_ex(ecx); } - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -2091,7 +2078,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_write_ex(ecx, data); } - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); @@ -2616,12 +2603,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) if (is_guest_mode(vcpu)) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( l1_offset, - static_call(kvm_x86_get_l2_tsc_offset)(vcpu), - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_offset)(vcpu), + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_offset = l1_offset; - static_call(kvm_x86_write_tsc_offset)(vcpu); + kvm_x86_call(write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2632,12 +2619,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli if (is_guest_mode(vcpu)) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( l1_multiplier, - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_caps.has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)(vcpu); + kvm_x86_call(write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) @@ -3610,7 +3597,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_flush_tlb_all)(vcpu); + kvm_x86_call(flush_tlb_all)(vcpu); /* Flushing all ASIDs flushes the current ASID... */ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); @@ -3631,7 +3618,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_prev_roots(vcpu); } - static_call(kvm_x86_flush_tlb_guest)(vcpu); + kvm_x86_call(flush_tlb_guest)(vcpu); /* * Flushing all "guest" TLB is always a superset of Hyper-V's fine @@ -3644,7 +3631,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); } /* @@ -4703,8 +4690,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: + case KVM_CAP_X86_GUEST_MODE: r = 1; break; + case KVM_CAP_PRE_FAULT_MEMORY: + r = tdp_enabled; + break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: + r = APIC_BUS_CYCLE_NS_DEFAULT; + break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; @@ -4753,7 +4747,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); + r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_NR_VCPUS: r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); @@ -4833,7 +4827,7 @@ static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val) { if (attr->group) { if (kvm_x86_ops.dev_get_attr) - return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val); + return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val); return -ENXIO; } @@ -4995,16 +4989,25 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + vcpu->arch.l1tf_flush_l1d = true; + + if (vcpu->scheduled_out && pmu->version && pmu->event_count) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } + /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (static_call(kvm_x86_has_wbinvd_exit)()) + if (kvm_x86_call(has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - static_call(kvm_x86_vcpu_load)(vcpu, cpu); + kvm_x86_call(vcpu_load)(vcpu, cpu); /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -5112,14 +5115,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); } - static_call(kvm_x86_vcpu_put)(vcpu); + kvm_x86_call(vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -5236,7 +5239,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, kvm_apic_after_set_mcg_cap(vcpu); - static_call(kvm_x86_setup_mce)(vcpu); + kvm_x86_call(setup_mce)(vcpu); out: return r; } @@ -5396,11 +5399,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); - events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); + events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu); /* events->sipi_vector is never valid when reporting to user space */ @@ -5482,8 +5485,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - static_call(kvm_x86_set_interrupt_shadow)(vcpu, - events->interrupt.shadow); + kvm_x86_call(set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { @@ -5492,7 +5495,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->nmi.pending) kvm_make_request(KVM_REQ_NMI, vcpu); } - static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); + kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -5840,7 +5843,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; - return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu); + return kvm_x86_call(enable_l2_tlb_flush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); @@ -5879,8 +5882,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), - GFP_KERNEL_ACCOUNT); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; if (!u.lapic) @@ -6073,7 +6075,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) break; - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!u.xsave) break; @@ -6104,7 +6106,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_XSAVE2: { int size = vcpu->arch.guest_fpu.uabi_size; - u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + u.xsave = kzalloc(size, GFP_KERNEL); r = -ENOMEM; if (!u.xsave) break; @@ -6122,7 +6124,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!u.xcrs) break; @@ -6330,14 +6332,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); + ret = kvm_x86_call(set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); + return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -6543,9 +6545,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - r = kvm_setup_empty_irq_routing(kvm); - if (r) - goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -6650,14 +6649,14 @@ split_irqchip_unlock: if (!kvm_x86_ops.vm_copy_enc_context_from) break; - r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); + r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]); break; case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: r = -EINVAL; if (!kvm_x86_ops.vm_move_enc_context_from) break; - r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); + r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]); break; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { @@ -6692,7 +6691,9 @@ split_irqchip_unlock: break; mutex_lock(&kvm->lock); - if (kvm->arch.max_vcpu_ids == cap->args[0]) { + if (kvm->arch.bsp_vcpu_id > cap->args[0]) { + ; + } else if (kvm->arch.max_vcpu_ids == cap->args[0]) { r = 0; } else if (!kvm->arch.max_vcpu_ids) { kvm->arch.max_vcpu_ids = cap->args[0]; @@ -6745,6 +6746,30 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: { + u64 bus_cycle_ns = cap->args[0]; + u64 unused; + + /* + * Guard against overflow in tmict_to_ns(). 128 is the highest + * divide value that can be programmed in APIC_TDCR. + */ + r = -EINVAL; + if (!bus_cycle_ns || + check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused)) + break; + + r = 0; + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + r = -ENXIO; + else if (kvm->created_vcpus) + r = -EINVAL; + else + kvm->arch.apic_bus_cycle_ns = bus_cycle_ns; + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -7213,6 +7238,9 @@ set_pit2_out: mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; + else if (arg > KVM_MAX_VCPU_IDS || + (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids)) + r = -EINVAL; else kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); @@ -7289,7 +7317,7 @@ set_pit2_out: if (!kvm_x86_ops.mem_enc_ioctl) goto out; - r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); + r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -7303,7 +7331,7 @@ set_pit2_out: if (!kvm_x86_ops.mem_enc_register_region) goto out; - r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); + r = kvm_x86_call(mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -7317,7 +7345,7 @@ set_pit2_out: if (!kvm_x86_ops.mem_enc_unregister_region) goto out; - r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); + r = kvm_x86_call(mem_enc_unregister_region)(kvm, ®ion); break; } #ifdef CONFIG_KVM_HYPERV @@ -7411,17 +7439,20 @@ static void kvm_probe_msr_to_save(u32 msr_index) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) return; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + case MSR_ARCH_PERFMON_PERFCTR0 ... + MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= kvm_pmu_cap.num_counters_gp) return; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + case MSR_ARCH_PERFMON_EVENTSEL0 ... + MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= kvm_pmu_cap.num_counters_gp) return; break; - case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + case MSR_ARCH_PERFMON_FIXED_CTR0 ... + MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= kvm_pmu_cap.num_counters_fixed) return; @@ -7452,7 +7483,7 @@ static void kvm_init_msr_lists(void) { unsigned i; - BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, + BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3, "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; @@ -7468,7 +7499,8 @@ static void kvm_init_msr_lists(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) + if (!kvm_x86_call(has_emulated_msr)(NULL, + emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -7527,13 +7559,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - static_call(kvm_x86_set_segment)(vcpu, var, seg); + kvm_x86_call(set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - static_call(kvm_x86_get_segment)(vcpu, var, seg); + kvm_x86_call(get_segment)(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, @@ -7556,7 +7588,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); @@ -7566,7 +7598,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -7619,7 +7651,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -7644,7 +7676,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -7667,7 +7699,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, if (system) access |= PFERR_IMPLICIT_ACCESS; - else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -7712,7 +7744,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v if (system) access |= PFERR_IMPLICIT_ACCESS; - else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -7733,8 +7765,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { - return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type, - insn, insn_len); + return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type, + insn, insn_len); } int handle_ud(struct kvm_vcpu *vcpu) @@ -7784,8 +7816,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, bool write) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) - | (write ? PFERR_WRITE_MASK : 0); + u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); /* * currently PKRU is only applied to ept enabled guest so @@ -8211,7 +8243,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return static_call(kvm_x86_get_segment_base)(vcpu, seg); + return kvm_x86_call(get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -8224,7 +8256,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (static_call(kvm_x86_has_wbinvd_exit)()) { + if (kvm_x86_call(has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -8328,27 +8360,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); + return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -8495,8 +8527,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, - &ctxt->exception); + return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, @@ -8521,6 +8553,11 @@ static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); } +static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt)); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); @@ -8533,7 +8570,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); + kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked); } static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) @@ -8578,7 +8615,8 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, if (!kvm_x86_ops.get_untagged_addr) return addr; - return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags); + return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt), + addr, flags); } static const struct x86_emulate_ops emulate_ops = { @@ -8619,6 +8657,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, + .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, .is_guest_mode = emulator_is_guest_mode, @@ -8630,7 +8669,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -8641,7 +8680,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); + kvm_x86_call(set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -8682,7 +8721,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -8738,9 +8777,8 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, */ memset(&info, 0, sizeof(info)); - static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], - &info[2], (u32 *)&info[3], - (u32 *)&info[4]); + kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2], + (u32 *)&info[3], (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -8817,7 +8855,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { + if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) { prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -8975,10 +9013,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); int r; - r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); + r = kvm_x86_call(skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; @@ -9000,19 +9038,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { - u32 shadow; - if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) return true; /* - * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, - * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first - * to avoid the relatively expensive CPUID lookup. + * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is + * active, but AMD compatible CPUs do not. */ - shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); - return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && - guest_cpuid_is_intel(vcpu); + if (!guest_cpuid_is_intel_compatible(vcpu)) + return false; + + return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS; } static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, @@ -9284,7 +9320,7 @@ restart: writeback: if (writeback) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; @@ -9301,7 +9337,7 @@ writeback: kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); + kvm_x86_call(update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -9700,7 +9736,7 @@ static int kvm_x86_check_processor_compatibility(void) __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; - return static_call(kvm_x86_check_processor_compatibility)(); + return kvm_x86_call(check_processor_compatibility)(); } static void kvm_x86_check_cpu_compat(void *ret) @@ -9772,19 +9808,19 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; if (boot_cpu_has(X86_FEATURE_XSAVE)) { - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } - rdmsrl_safe(MSR_EFER, &host_efer); + rdmsrl_safe(MSR_EFER, &kvm_host.efer); if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); + rdmsrl(MSR_IA32_XSS, kvm_host.xss); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); r = ops->hardware_setup(); if (r != 0) @@ -9843,7 +9879,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) out_unwind_ops: kvm_x86_ops.hardware_enable = NULL; - static_call(kvm_x86_hardware_unsetup)(); + kvm_x86_call(hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); out_free_percpu: @@ -9874,7 +9910,7 @@ void kvm_x86_vendor_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - static_call(kvm_x86_hardware_unsetup)(); + kvm_x86_call(hardware_unsetup)(); kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -10000,7 +10036,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) { ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); - ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); + ulong vcpu_reasons = + kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu); return (vm_reasons | vcpu_reasons) == 0; } @@ -10009,6 +10046,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) { + const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS }; + + BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS); + if (set) __set_bit(reason, inhibits); else @@ -10020,7 +10061,7 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits, static void kvm_apicv_init(struct kvm *kvm) { enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT : - APICV_INHIBIT_REASON_DISABLE; + APICV_INHIBIT_REASON_DISABLED; set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true); @@ -10182,7 +10223,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = kvm_rdx_read(vcpu); a3 = kvm_rsi_read(vcpu); op_64_bit = is_64_bit_hypercall(vcpu); - cpl = static_call(kvm_x86_get_cpl)(vcpu); + cpl = kvm_x86_call(get_cpl)(vcpu); ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl); if (nr == KVM_HC_MAP_GPA_RANGE && !ret) @@ -10214,7 +10255,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return X86EMUL_PROPAGATE_FAULT; } - static_call(kvm_x86_patch_hypercall)(vcpu, instruction); + kvm_x86_call(patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -10231,7 +10272,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); + kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); @@ -10241,6 +10282,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; + if (is_guest_mode(vcpu)) + kvm_run->flags |= KVM_RUN_X86_GUEST_MODE; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -10266,7 +10309,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); + kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr); } @@ -10296,7 +10339,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) vcpu->arch.exception.error_code, vcpu->arch.exception.injected); - static_call(kvm_x86_inject_exception)(vcpu); + kvm_x86_call(inject_exception)(vcpu); } /* @@ -10382,9 +10425,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, else if (kvm_is_exception_pending(vcpu)) ; /* see above */ else if (vcpu->arch.nmi_injected) - static_call(kvm_x86_inject_nmi)(vcpu); + kvm_x86_call(inject_nmi)(vcpu); else if (vcpu->arch.interrupt.injected) - static_call(kvm_x86_inject_irq)(vcpu, true); + kvm_x86_call(inject_irq)(vcpu, true); /* * Exceptions that morph to VM-Exits are handled above, and pending @@ -10469,7 +10512,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, */ #ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { - r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { @@ -10478,27 +10522,29 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, enter_smm(vcpu); can_inject = false; } else - static_call(kvm_x86_enable_smi_window)(vcpu); + kvm_x86_call(enable_smi_window)(vcpu); } #endif if (vcpu->arch.nmi_pending) { - r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_inject_nmi)(vcpu); + kvm_x86_call(inject_nmi)(vcpu); can_inject = false; - WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); + WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0); } if (vcpu->arch.nmi_pending) - static_call(kvm_x86_enable_nmi_window)(vcpu); + kvm_x86_call(enable_nmi_window)(vcpu); } if (kvm_cpu_has_injectable_intr(vcpu)) { - r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { @@ -10506,17 +10552,17 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, if (!WARN_ON_ONCE(irq == -1)) { kvm_queue_interrupt(vcpu, irq, false); - static_call(kvm_x86_inject_irq)(vcpu, false); - WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + kvm_x86_call(inject_irq)(vcpu, false); + WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0); } } if (kvm_cpu_has_injectable_intr(vcpu)) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); } if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu)) + kvm_x86_ops.nested_ops->has_events(vcpu, true)) *req_immediate_exit = true; /* @@ -10557,7 +10603,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * blocks NMIs). KVM will immediately inject one of the two NMIs, and * will request an NMI window to handle the second NMI. */ - if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; else limit = 2; @@ -10566,14 +10612,14 @@ static void process_nmi(struct kvm_vcpu *vcpu) * Adjust the limit to account for pending virtual NMIs, which aren't * tracked in vcpu->arch.nmi_pending. */ - if (static_call(kvm_x86_is_vnmi_pending)(vcpu)) + if (kvm_x86_call(is_vnmi_pending)(vcpu)) limit--; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); if (vcpu->arch.nmi_pending && - (static_call(kvm_x86_set_vnmi_pending)(vcpu))) + (kvm_x86_call(set_vnmi_pending)(vcpu))) vcpu->arch.nmi_pending--; if (vcpu->arch.nmi_pending) @@ -10584,7 +10630,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) { return vcpu->arch.nmi_pending + - static_call(kvm_x86_is_vnmi_pending)(vcpu); + kvm_x86_call(is_vnmi_pending)(vcpu); } void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, @@ -10618,7 +10664,7 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); - static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu); /* * When APICv gets disabled, we may still have injected interrupts @@ -10718,7 +10764,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); @@ -10743,17 +10789,17 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); - static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } #endif - static_call_cond(kvm_x86_load_eoi_exitmap)( + kvm_x86_call(load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { - static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); + kvm_x86_call(guest_memory_reclaimed)(kvm); } static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) @@ -10761,7 +10807,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); + kvm_x86_call(set_apic_access_page_addr)(vcpu); } /* @@ -10925,10 +10971,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - static_call(kvm_x86_msr_filter_changed)(vcpu); + kvm_x86_call(msr_filter_changed)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) - static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + kvm_x86_call(update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -10950,7 +11004,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (req_int_win) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -10965,7 +11019,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_switch_to_guest)(vcpu); + kvm_x86_call(prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -11001,7 +11055,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -11045,12 +11099,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, + req_immediate_exit); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; if (kvm_lapic_enabled(vcpu)) - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; @@ -11069,7 +11124,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); + kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); } @@ -11098,7 +11153,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.xfd_no_write_intercept) fpu_sync_guest_vmexit_xfd_state(); - static_call(kvm_x86_handle_exit_irqoff)(vcpu); + kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) wrmsrl(MSR_IA32_XFD_ERR, 0); @@ -11131,6 +11186,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); /* + * Call this to ensure WC buffers in guest are evicted after each VM + * Exit, so that the evicted WC writes can be snooped across all cpus + */ + smp_mb__after_srcu_read_lock(); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) { @@ -11144,13 +11205,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_cancel_injection)(vcpu); + kvm_x86_call(cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: @@ -11200,7 +11261,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) * causes a spurious wakeup from HLT). */ if (is_guest_mode(vcpu)) { - if (kvm_check_nested_events(vcpu) < 0) + int r = kvm_check_nested_events(vcpu); + + WARN_ON_ONCE(r == -EBUSY); + if (r < 0) return 0; } @@ -11237,7 +11301,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu) int r; vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->arch.l1tf_flush_l1d = true; for (;;) { /* @@ -11387,7 +11450,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { - if (kvm_run->immediate_exit) { + if (!vcpu->wants_to_run) { r = -EINTR; goto out; } @@ -11465,12 +11528,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vcpu->mmio_needed); } - if (kvm_run->immediate_exit) { + if (!vcpu->wants_to_run) { r = -EINTR; goto out; } - r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + r = kvm_x86_call(vcpu_pre_run)(vcpu); if (r <= 0) goto out; @@ -11598,10 +11661,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -11743,7 +11806,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (ret) { + + /* + * Report an error userspace if MMIO is needed, as KVM doesn't support + * MMIO during a task switch (or any other complex operation). + */ + if (ret || vcpu->mmio_needed) { + vcpu->mmio_needed = false; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -11801,27 +11870,27 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); - static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); + kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - static_call(kvm_x86_set_efer)(vcpu, sregs->efer); + kvm_x86_call(set_efer)(vcpu, sregs->efer); *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); + kvm_x86_call(set_cr0)(vcpu, sregs->cr0); *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); + kvm_x86_call(set_cr4)(vcpu, sregs->cr4); if (update_pdptrs) { idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -11999,7 +12068,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_x86_call(update_exception_bitmap)(vcpu); kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); @@ -12136,7 +12205,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return static_call(kvm_x86_vcpu_precreate)(kvm); + return kvm_x86_call(vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -12207,14 +12276,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.hv_root_tdp = INVALID_PAGE; #endif - r = static_call(kvm_x86_vcpu_create)(vcpu); + r = kvm_x86_call(vcpu_create)(vcpu); if (r) goto free_guest_fpu; vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_xen_init_vcpu(vcpu); - kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); @@ -12265,7 +12333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); - static_call(kvm_x86_vcpu_free)(vcpu); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); @@ -12383,7 +12451,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); - static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_x86_call(vcpu_reset)(vcpu, init_event); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); @@ -12402,10 +12470,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) else new_cr0 |= X86_CR0_NW | X86_CR0_CD; - static_call(kvm_x86_set_cr0)(vcpu, new_cr0); - static_call(kvm_x86_set_cr4)(vcpu, 0); - static_call(kvm_x86_set_efer)(vcpu, 0); - static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_x86_call(set_cr0)(vcpu, new_cr0); + kvm_x86_call(set_cr4)(vcpu, 0); + kvm_x86_call(set_efer)(vcpu, 0); + kvm_x86_call(update_exception_bitmap)(vcpu); /* * On the standard CR0/CR4/EFER modification paths, there are several @@ -12462,7 +12530,7 @@ int kvm_arch_hardware_enable(void) if (ret) return ret; - ret = static_call(kvm_x86_hardware_enable)(); + ret = kvm_x86_call(hardware_enable)(); if (ret != 0) return ret; @@ -12544,7 +12612,7 @@ int kvm_arch_hardware_enable(void) void kvm_arch_hardware_disable(void) { - static_call(kvm_x86_hardware_disable)(); + kvm_x86_call(hardware_disable)(); drop_user_return_notifiers(); } @@ -12558,18 +12626,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - vcpu->arch.l1tf_flush_l1d = true; - if (pmu->version && unlikely(pmu->event_count)) { - pmu->need_cleanup = true; - kvm_make_request(KVM_REQ_PMU, vcpu); - } - static_call(kvm_x86_sched_in)(vcpu, cpu); -} - void kvm_arch_free_vm(struct kvm *kvm) { #if IS_ENABLED(CONFIG_HYPERV) @@ -12597,7 +12653,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_mmu_init_vm(kvm); - ret = static_call(kvm_x86_vm_init)(kvm); + ret = kvm_x86_call(vm_init)(kvm); if (ret) goto out_uninit_mmu; @@ -12620,6 +12676,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; + kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT; kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.enable_pmu = enable_pmu; @@ -12771,7 +12828,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } kvm_unload_vcpu_mmus(kvm); - static_call_cond(kvm_x86_vm_destroy)(kvm); + kvm_x86_call(vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); @@ -13100,12 +13157,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - return (is_guest_mode(vcpu) && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); -} - static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) @@ -13123,22 +13174,23 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - static_call(kvm_x86_nmi_allowed)(vcpu, false))) + kvm_x86_call(nmi_allowed)(vcpu, false))) return true; #ifdef CONFIG_KVM_SMM if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && - static_call(kvm_x86_smi_allowed)(vcpu, false))) + kvm_x86_call(smi_allowed)(vcpu, false))) return true; #endif if (kvm_test_request(KVM_REQ_PMI, vcpu)) return true; - if (kvm_arch_interrupt_allowed(vcpu) && - (kvm_cpu_has_interrupt(vcpu) || - kvm_guest_apic_has_interrupt(vcpu))) + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; if (kvm_hv_has_stimer_pending(vcpu)) @@ -13146,7 +13198,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu)) + kvm_x86_ops.nested_ops->has_events(vcpu, false)) return true; if (kvm_xen_has_pending_events(vcpu)) @@ -13163,7 +13215,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { return kvm_vcpu_apicv_active(vcpu) && - static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu); + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); } bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) @@ -13191,7 +13243,7 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - return static_call(kvm_x86_get_cpl)(vcpu) == 0; + return kvm_x86_call(get_cpl)(vcpu) == 0; } unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) @@ -13206,7 +13258,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_interrupt_allowed)(vcpu, false); + return kvm_x86_call(interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -13232,7 +13284,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = static_call(kvm_x86_get_rflags)(vcpu); + rflags = kvm_x86_call(get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -13244,7 +13296,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - static_call(kvm_x86_set_rflags)(vcpu, rflags); + kvm_x86_call(set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -13356,7 +13408,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) return false; if (vcpu->arch.apf.send_user_only && - static_call(kvm_x86_get_cpl)(vcpu) == 0) + kvm_x86_call(get_cpl)(vcpu) == 0) return false; if (is_guest_mode(vcpu)) { @@ -13467,7 +13519,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_pi_start_assignment)(kvm); + kvm_x86_call(pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -13486,13 +13538,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) { /* - * Non-coherent DMA assignment and de-assignment will affect - * whether KVM honors guest MTRRs and cause changes in memtypes - * in TDP. - * So, pass %true unconditionally to indicate non-coherent DMA was, - * or will be involved, and that zapping SPTEs might be necessary. + * Non-coherent DMA assignment and de-assignment may affect whether or + * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs + * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first + * (or last) non-coherent device is (un)registered to so that new SPTEs + * with the correct "ignore guest PAT" setting are created. */ - if (__kvm_mmu_honors_guest_mtrrs(true)) + if (kvm_mmu_may_ignore_guest_pat()) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -13530,9 +13582,8 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); @@ -13555,7 +13606,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -13566,7 +13618,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); + return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, @@ -13589,6 +13641,24 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_SNP_VM; +} + +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) +{ + return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order); +} +#endif + +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_x86_call(gmem_invalidate)(start, end); +} +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -13974,6 +14044,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d80a4c6b5a38..50596f6f8320 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -33,6 +33,20 @@ struct kvm_caps { u64 supported_perf_cap; }; +struct kvm_host_values { + /* + * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical + * address bits irrespective of features that repurpose legal bits, + * e.g. MKTME. + */ + u8 maxphyaddr; + + u64 efer; + u64 xcr0; + u64 xss; + u64 arch_capabilities; +}; + void kvm_spurious_fault(void); #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ @@ -159,7 +173,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) if (!is_long_mode(vcpu)) return false; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } @@ -311,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu); void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct kvm_queued_exception *ex); -void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); -u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, - int page_num); bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, @@ -325,11 +335,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); -extern u64 host_xcr0; -extern u64 host_xss; -extern u64 host_arch_capabilities; - extern struct kvm_caps kvm_caps; +extern struct kvm_host_values kvm_host; extern bool enable_pmu; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f65b35a05d91..622fe24da910 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -741,7 +741,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } else { void __user * hva = u64_to_user_ptr(data->u.shared_info.hva); - if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) { + if (!PAGE_ALIGNED(hva)) { r = -EINVAL; } else if (!hva) { kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); @@ -1270,7 +1270,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) instructions[0] = 0xb8; /* vmcall / vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); + kvm_x86_call(patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; @@ -1650,7 +1650,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) params[5] = (u64)kvm_r9_read(vcpu); } #endif - cpl = static_call(kvm_x86_get_cpl)(vcpu); + cpl = kvm_x86_call(get_cpl)(vcpu); trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); |