Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 8
-rw-r--r--  arch/x86/include/asm/kvm-x86-pmu-ops.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 90
-rw-r--r--  arch/x86/include/asm/sev-common.h | 25
-rw-r--r--  arch/x86/include/asm/sev.h | 51
-rw-r--r--  arch/x86/include/asm/svm.h | 9
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 49
-rw-r--r--  arch/x86/kvm/Kconfig | 4
-rw-r--r--  arch/x86/kvm/cpuid.c | 14
-rw-r--r--  arch/x86/kvm/cpuid.h | 18
-rw-r--r--  arch/x86/kvm/emulate.c | 71
-rw-r--r--  arch/x86/kvm/hyperv.c | 9
-rw-r--r--  arch/x86/kvm/irq.c | 2
-rw-r--r--  arch/x86/kvm/irq.h | 1
-rw-r--r--  arch/x86/kvm/irq_comm.c | 7
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 10
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 1
-rw-r--r--  arch/x86/kvm/lapic.c | 48
-rw-r--r--  arch/x86/kvm/lapic.h | 5
-rw-r--r--  arch/x86/kvm/mmu.h | 42
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 206
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 26
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 3
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 46
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 10
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 136
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 2
-rw-r--r--  arch/x86/kvm/mtrr.c | 644
-rw-r--r--  arch/x86/kvm/pmu.c | 73
-rw-r--r--  arch/x86/kvm/pmu.h | 10
-rw-r--r--  arch/x86/kvm/smm.c | 44
-rw-r--r--  arch/x86/kvm/svm/nested.c | 2
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 11
-rw-r--r--  arch/x86/kvm/svm/sev.c | 1564
-rw-r--r--  arch/x86/kvm/svm/svm.c | 78
-rw-r--r--  arch/x86/kvm/svm/svm.h | 70
-rw-r--r--  arch/x86/kvm/trace.h | 55
-rw-r--r--  arch/x86/kvm/vmx/main.c | 5
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 55
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 52
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.h | 10
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h | 14
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 205
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 3
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h | 4
-rw-r--r--  arch/x86/kvm/x86.c | 567
-rw-r--r--  arch/x86/kvm/x86.h | 25
-rw-r--r--  arch/x86/kvm/xen.c | 6
48 files changed, 2852 insertions(+), 1541 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 5187fcf4b610..68ad4f923664 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -9,8 +9,7 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
@@ -85,7 +84,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
-KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
@@ -103,7 +101,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
@@ -139,6 +136,9 @@ KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
+KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL(gmem_invalidate)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
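For context, a minimal sketch of how this op list is consumed (illustrative, not part of the change; it mirrors the DECLARE_STATIC_CALL declarations that appear later in kvm_host.h). Each consumer defines the per-op macros and then includes the header once:

#define KVM_X86_OP(func) \
	DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>

Mandatory ops must be provided by the vendor module, OPTIONAL ops may be left NULL, and OPTIONAL_RET0 ops fall back to __static_call_return0 when unset.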
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index f852b13aeefe..9159bf1a4730 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -9,8 +9,7 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites.
+ * a NULL definition.
*/
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8ca74e7678f..950a03e0181e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -121,6 +121,7 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -159,7 +160,6 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
-#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
@@ -533,12 +533,16 @@ struct kvm_pmc {
};
/* More counters may conflict with other existing Architectural MSRs */
-#define KVM_INTEL_PMC_MAX_GENERIC 8
-#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define KVM_PMC_MAX_FIXED 3
-#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
-#define KVM_AMD_PMC_MAX_GENERIC 6
+#define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define KVM_MAX_NR_INTEL_GP_COUNTERS 8
+#define KVM_MAX_NR_AMD_GP_COUNTERS 6
+#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
+ KVM_MAX_NR_AMD_GP_COUNTERS)
+
+#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3
+#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0
+#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \
+ KVM_MAX_NR_AMD_FIXED_COUTNERS)
struct kvm_pmu {
u8 version;
@@ -546,16 +550,16 @@ struct kvm_pmu {
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
u64 fixed_ctr_ctrl;
- u64 fixed_ctr_ctrl_mask;
+ u64 fixed_ctr_ctrl_rsvd;
u64 global_ctrl;
u64 global_status;
u64 counter_bitmask[2];
- u64 global_ctrl_mask;
- u64 global_status_mask;
+ u64 global_ctrl_rsvd;
+ u64 global_status_rsvd;
u64 reserved_bits;
u64 raw_event_mask;
- struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
- struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
+ struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
+ struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
/*
* Overlay the bitmap with a 64-bit atomic so that all bits can be
@@ -571,9 +575,9 @@ struct kvm_pmu {
u64 ds_area;
u64 pebs_enable;
- u64 pebs_enable_mask;
+ u64 pebs_enable_rsvd;
u64 pebs_data_cfg;
- u64 pebs_data_cfg_mask;
+ u64 pebs_data_cfg_rsvd;
/*
* If a guest counter is cross-mapped to host counter with different
@@ -604,18 +608,12 @@ enum {
KVM_DEBUGREG_WONT_EXIT = 2,
};
-struct kvm_mtrr_range {
- u64 base;
- u64 mask;
- struct list_head node;
-};
-
struct kvm_mtrr {
- struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
- mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 var[KVM_NR_VAR_MTRR * 2];
+ u64 fixed_64k;
+ u64 fixed_16k[2];
+ u64 fixed_4k[8];
u64 deftype;
-
- struct list_head head;
};
/* Hyper-V SynIC timer */
@@ -1207,7 +1205,7 @@ enum kvm_apicv_inhibit {
* APIC acceleration is disabled by a module parameter
* and/or not supported in hardware.
*/
- APICV_INHIBIT_REASON_DISABLE,
+ APICV_INHIBIT_REASON_DISABLED,
/*
* APIC acceleration is inhibited because AutoEOI feature is
@@ -1277,8 +1275,27 @@ enum kvm_apicv_inhibit {
* mapping between logical ID and vCPU.
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+
+ NR_APICV_INHIBIT_REASONS,
};
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+
struct kvm_arch {
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
@@ -1364,6 +1381,7 @@ struct kvm_arch {
u32 default_tsc_khz;
bool user_set_tsc;
+ u64 apic_bus_cycle_ns;
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
@@ -1708,13 +1726,11 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
- bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
@@ -1749,8 +1765,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
-
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.
@@ -1812,6 +1826,9 @@ struct kvm_x86_ops {
gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+ int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+ void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
};
struct kvm_x86_nested_ops {
@@ -1819,7 +1836,7 @@ struct kvm_x86_nested_ops {
bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*has_events)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
@@ -1853,11 +1870,13 @@ struct kvm_arch_async_pf {
};
extern u32 __read_mostly kvm_nr_uret_msrs;
-extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
+#define kvm_x86_call(func) static_call(kvm_x86_##func)
+#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
@@ -1881,7 +1900,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
if (kvm_x86_ops.flush_remote_tlbs &&
- !static_call(kvm_x86_flush_remote_tlbs)(kvm))
+ !kvm_x86_call(flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1894,7 +1913,7 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
if (!kvm_x86_ops.flush_remote_tlbs_range)
return -EOPNOTSUPP;
- return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+ return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
#endif /* CONFIG_HYPERV */
@@ -1939,6 +1958,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -2292,12 +2312,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
+ kvm_x86_call(vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
+ kvm_x86_call(vcpu_unblocking)(vcpu);
}
static inline int kvm_cpu_get_apicid(int mps_cpu)
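Illustrative sanity check (not part of the diff) of what the new unified PMU limits evaluate to, given the Intel/AMD values defined above:

#include <linux/build_bug.h>

/* KVM_MAX(8, 6) == 8 and KVM_MAX(3, 0) == 3, so kvm_pmu's gp_counters[] and
 * fixed_counters[] arrays are sized for the larger (Intel) limits regardless
 * of which vendor module is loaded. */
static_assert(KVM_MAX_NR_GP_COUNTERS == 8);
static_assert(KVM_MAX_NR_FIXED_COUNTERS == 3);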
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index e90d403f2068..98726c2b04f8 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -59,6 +59,14 @@
#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
+/* Preferred GHCB GPA Request */
+#define GHCB_MSR_PREF_GPA_REQ 0x010
+#define GHCB_MSR_GPA_VALUE_POS 12
+#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0)
+
+#define GHCB_MSR_PREF_GPA_RESP 0x011
+#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff
+
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
#define GHCB_MSR_REG_GPA_REQ_VAL(v) \
@@ -93,11 +101,17 @@ enum psc_op {
/* GHCBData[11:0] */ \
GHCB_MSR_PSC_REQ)
+#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12)
+#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52)
+
#define GHCB_MSR_PSC_RESP 0x015
#define GHCB_MSR_PSC_RESP_VAL(val) \
/* GHCBData[63:32] */ \
(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+/* Set highest bit as a generic error response */
+#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP)
+
/* GHCB Run at VMPL Request/Response */
#define GHCB_MSR_VMPL_REQ 0x016
#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
@@ -129,8 +143,19 @@ enum psc_op {
* The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
* is a local stack variable in set_pages_state(). Do not increase this value
* without evaluating the impact to stack usage.
+ *
+ * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value
+ * is needed, such as when processing GHCB requests on the hypervisor side.
*/
#define VMGEXIT_PSC_MAX_ENTRY 64
+#define VMGEXIT_PSC_MAX_COUNT 253
+
+#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32)
+#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1)
+#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2)
+
+#define VMGEXIT_PSC_OP_PRIVATE 1
+#define VMGEXIT_PSC_OP_SHARED 2
struct psc_hdr {
u16 cur_entry;
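A small sketch (hypothetical MSR value, not from the patch) of decoding an MSR-protocol Page State Change request with the new helpers:

/* Op in GHCBData[55:52], GFN in GHCBData[51:12], request code in [11:0]. */
u64 ghcb_msr = (1ULL << 52) | (0x123456ULL << 12) | GHCB_MSR_PSC_REQ;

u64 gfn = GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr);	/* 0x123456 */
u64 op  = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);	/* 1 == VMGEXIT_PSC_OP_PRIVATE */

/* A failed request would be answered with GHCB_MSR_PSC_RESP_ERROR, i.e. the
 * response code with bit 63 set as a generic error indication. */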
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ac5886ce252e..79bbe2be900e 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* RMPUPDATE detected 4K page and 2MB page overlap. */
#define RMPUPDATE_FAIL_OVERLAP 4
+/* PSMASH failed due to concurrent access by another CPU */
+#define PSMASH_FAIL_INUSE 3
+
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
@@ -116,6 +119,54 @@ struct snp_req_data {
unsigned int data_npages;
};
+#define MAX_AUTHTAG_LEN 32
+
+/* See SNP spec SNP_GUEST_REQUEST section for the structure */
+enum msg_type {
+ SNP_MSG_TYPE_INVALID = 0,
+ SNP_MSG_CPUID_REQ,
+ SNP_MSG_CPUID_RSP,
+ SNP_MSG_KEY_REQ,
+ SNP_MSG_KEY_RSP,
+ SNP_MSG_REPORT_REQ,
+ SNP_MSG_REPORT_RSP,
+ SNP_MSG_EXPORT_REQ,
+ SNP_MSG_EXPORT_RSP,
+ SNP_MSG_IMPORT_REQ,
+ SNP_MSG_IMPORT_RSP,
+ SNP_MSG_ABSORB_REQ,
+ SNP_MSG_ABSORB_RSP,
+ SNP_MSG_VMRK_REQ,
+ SNP_MSG_VMRK_RSP,
+
+ SNP_MSG_TYPE_MAX
+};
+
+enum aead_algo {
+ SNP_AEAD_INVALID,
+ SNP_AEAD_AES_256_GCM,
+};
+
+struct snp_guest_msg_hdr {
+ u8 authtag[MAX_AUTHTAG_LEN];
+ u64 msg_seqno;
+ u8 rsvd1[8];
+ u8 algo;
+ u8 hdr_version;
+ u16 hdr_sz;
+ u8 msg_type;
+ u8 msg_version;
+ u16 msg_sz;
+ u32 rsvd2;
+ u8 msg_vmpck;
+ u8 rsvd3[35];
+} __packed;
+
+struct snp_guest_msg {
+ struct snp_guest_msg_hdr hdr;
+ u8 payload[4000];
+} __packed;
+
struct sev_guest_platform_data {
u64 secrets_gpa;
};
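Illustrative size check for the structures above (assuming the layout matches the SNP guest-message format): the 96-byte header plus the 4000-byte payload fills exactly one 4 KiB page.

#include <linux/build_bug.h>

static_assert(sizeof(struct snp_guest_msg_hdr) == 0x60);	/* 96 bytes */
static_assert(sizeof(struct snp_guest_msg) == 4096);		/* one page */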
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 728c98175b9c..f0dea3750ca9 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -285,7 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
+#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
+#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
+#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+
+#define SVM_SEV_FEAT_INT_INJ_MODES \
+ (SVM_SEV_FEAT_RESTRICTED_INJECTION | \
+ SVM_SEV_FEAT_ALTERNATE_INJECTION)
struct vmcb_seg {
u16 selector;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 9fae1b73b529..bf57a824f722 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -697,6 +698,11 @@ enum sev_cmd_id {
/* Second time is the charm; improved versions of the above ioctls. */
KVM_SEV_INIT2,
+ /* SNP-specific commands */
+ KVM_SEV_SNP_LAUNCH_START = 100,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ KVM_SEV_SNP_LAUNCH_FINISH,
+
KVM_SEV_NR_MAX,
};
@@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data {
__u32 pad2;
};
+struct kvm_sev_snp_launch_start {
+ __u64 policy;
+ __u8 gosvw[16];
+ __u16 flags;
+ __u8 pad0[6];
+ __u64 pad1[4];
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+ __u64 gfn_start;
+ __u64 uaddr;
+ __u64 len;
+ __u8 type;
+ __u8 pad0;
+ __u16 flags;
+ __u32 pad1;
+ __u64 pad2[4];
+};
+
+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+ __u8 pad0[3];
+ __u16 flags;
+ __u64 pad1[4];
+};
+
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
@@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SW_PROTECTED_VM 1
#define KVM_X86_SEV_VM 2
#define KVM_X86_SEV_ES_VM 3
+#define KVM_X86_SNP_VM 4
#endif /* _ASM_X86_KVM_H */
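For orientation, a rough userspace sketch (not part of this patch; assumes the VM was already created as KVM_X86_SNP_VM and initialized via KVM_SEV_INIT2) of issuing the new launch-start command through the existing KVM_MEMORY_ENCRYPT_OP path:

#include <err.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void snp_launch_start(int vm_fd, int sev_fd)
{
	struct kvm_sev_snp_launch_start start = {
		.policy = 0x30000,	/* hypothetical policy: reserved bit 17 set, SMT allowed */
	};
	struct kvm_sev_cmd cmd = {
		.id     = KVM_SEV_SNP_LAUNCH_START,
		.data   = (unsigned long)&start,
		.sev_fd = sev_fd,	/* open file descriptor for /dev/sev */
	};

	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0)
		err(1, "KVM_SEV_SNP_LAUNCH_START (fw error %u)", cmd.error);
}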
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fec95a770270..4287a8071a3a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -44,6 +44,7 @@ config KVM
select KVM_VFIO
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_WERROR if WERROR
help
Support hosting fully virtualized guest machines using hardware
@@ -139,6 +140,9 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
+ select KVM_GENERIC_PRIVATE_MEM
+ select HAVE_KVM_GMEM_PREPARE
+ select HAVE_KVM_GMEM_INVALIDATE
help
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f2f2be5d1141..2617be544480 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,18 @@ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
#endif
}
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -388,7 +400,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cpuid_nent));
/* Invoke the vendor callback only after the above state is updated. */
- static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+ kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
/*
* Except for the MMU, which needs to do its thing any vendor specific
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 23dbb9eb277c..41697cca354e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -102,24 +102,6 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
*reg &= ~__feature_bit(x86_feature);
}
-static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best &&
- (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
- is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
-}
-
-static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
-}
-
static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
{
return vcpu->arch.is_amd_compatible;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c8cc578646d0..e72aed25d721 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2354,50 +2354,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
ss->avl = 0;
}
-static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
-{
- u32 eax, ebx, ecx, edx;
-
- eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- return is_guest_vendor_intel(ebx, ecx, edx);
-}
-
-static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
-{
- const struct x86_emulate_ops *ops = ctxt->ops;
- u32 eax, ebx, ecx, edx;
-
- /*
- * syscall should always be enabled in longmode - so only become
- * vendor specific (cpuid) if other modes are active...
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return true;
-
- eax = 0x00000000;
- ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- /*
- * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a
- * 64bit guest with a 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD response - CPUs of
- * AMD can't behave like Intel.
- */
- if (is_guest_vendor_intel(ebx, ecx, edx))
- return false;
-
- if (is_guest_vendor_amd(ebx, ecx, edx) ||
- is_guest_vendor_hygon(ebx, ecx, edx))
- return true;
-
- /*
- * default: (not Intel, not AMD, not Hygon), apply Intel's
- * stricter rules...
- */
- return false;
-}
-
static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -2411,7 +2367,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
- if (!(em_syscall_is_enabled(ctxt)))
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
@@ -2471,11 +2435,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
/*
- * Not recognized on AMD in compat mode (but is recognized in legacy
- * mode).
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
*/
- if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
- && !vendor_intel(ctxt))
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
/* sysenter/sysexit have not been tested in 64bit mode. */
@@ -2647,7 +2611,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
* manner when ECX is zero due to REP-string optimizations.
*/
#ifdef CONFIG_X86_64
- if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
return;
*reg_write(ctxt, VCPU_REGS_RCX) = 0;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 8a47f8541eab..4f0a94346d00 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1417,7 +1417,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
}
/* vmcall/vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
i += 3;
/* ret */
@@ -1737,7 +1737,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
break;
case HV_X64_MSR_APIC_FREQUENCY:
- data = APIC_BUS_FREQUENCY;
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
break;
default:
kvm_pr_unimpl_rdmsr(vcpu, msr);
@@ -1985,7 +1986,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
*/
gva = entries[i] & PAGE_MASK;
for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+ kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
++vcpu->stat.tlb_flush;
}
@@ -2526,7 +2527,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
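Worked example for the calculation above (illustrative values): with the default 1 ns bus cycle the MSR still reports the old hardcoded rate, while a hypothetical 40 ns cycle configured by userspace would report a 25 MHz bus.

div64_u64(1000000000ULL, 1);	/* == 1000000000 Hz, the old APIC_BUS_FREQUENCY */
div64_u64(1000000000ULL, 40);	/* ==   25000000 Hz                             */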
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ad9ca8a60144..3d7eb11d0e45 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -157,7 +157,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- static_call_cond(kvm_x86_migrate_timers)(vcpu);
+ kvm_x86_call(migrate_timers)(vcpu);
}
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index c2d7cfe82d00..76d46b2f41dd 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
-int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 68f3f6c26046..8136695f7b96 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
ARRAY_SIZE(default_routing), 0);
}
-static const struct kvm_irq_routing_entry empty_routing[] = {};
-
-int kvm_setup_empty_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
-}
-
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (!irqchip_split(kvm))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 75eae9c4998a..b1eb46e26b2e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -98,7 +98,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg
return 0;
if (!kvm_register_is_available(vcpu, reg))
- static_call(kvm_x86_cache_reg)(vcpu, reg);
+ kvm_x86_call(cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -138,7 +138,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -153,7 +153,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -175,7 +175,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
@@ -190,7 +190,7 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 29ea4313e1bb..55a18e2f2dcd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -223,6 +223,7 @@ struct x86_emulate_ops {
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index acd7d48100a1..a7172ba59ad2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -738,8 +738,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(apic->apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
- apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_irr_update)(apic->vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -765,7 +765,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
+ kvm_x86_call(hwapic_isr_update)(vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -810,7 +810,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -946,7 +946,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (kvm_x86_ops.sync_pir_to_irr)
- highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
+ highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1338,8 +1338,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
- trig_mode, vector);
+ kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -1557,7 +1557,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1973,7 +1974,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
{
- return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
}
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
@@ -2103,7 +2105,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
+ kvm_x86_call(cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -2120,7 +2122,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2575,7 +2577,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+ kvm_x86_call(set_virtual_apic_mode)(vcpu);
}
apic->base_address = apic->vcpu->arch.apic_base &
@@ -2685,7 +2687,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
u64 msr_val;
int i;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
if (!init_event) {
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
@@ -2740,9 +2742,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu, -1);
+ kvm_x86_call(hwapic_isr_update)(-1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2838,7 +2840,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
if (kvm_x86_ops.alloc_apic_backing_page)
- apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+ apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
else
apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
@@ -3017,7 +3019,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
struct kvm_lapic *apic = vcpu->arch.apic;
int r;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
@@ -3044,9 +3046,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu,
+ apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -3334,7 +3337,8 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
+ kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
+ sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
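Worked example (hypothetical values) for the two conversions above, tmict_to_ns() and apic_get_tmcct(): with apic_bus_cycle_ns == 1 and divide_count == 16, a TMICT of 1000 programs a 16000 ns period, and 4000 ns of remaining time reads back as TMCCT == 250.

tmict_to_ns(apic, 1000);	/* 1000 * 1 * 16 == 16000 ns */
div64_u64(4000, 1 * 16);	/* remaining ns -> 250 ticks */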
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a69e706b9080..7ef8ae73e82d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,8 +16,7 @@
#define APIC_DEST_NOSHORT 0x0
#define APIC_DEST_MASK 0x800
-#define APIC_BUS_CYCLE_NS 1
-#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
@@ -236,7 +235,7 @@ static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
{
return !is_smm(vcpu) &&
- !static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
+ !kvm_x86_call(apic_init_signal_blocked)(vcpu);
}
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2e454316f2a2..4341e0e28571 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,12 +57,6 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
@@ -76,30 +70,11 @@ static inline gfn_t kvm_mmu_max_gfn(void)
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
- int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
-static inline u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
-
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
@@ -163,8 +138,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
- vcpu->arch.mmu->root_role.level);
+ kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
}
static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
@@ -199,7 +174,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
{
/* strip nested paging fault error codes */
unsigned int pfec = access;
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
/*
* For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
@@ -246,14 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
-
-static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
-{
- return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
-}
-
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+bool kvm_mmu_may_ignore_guest_pat(void);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8d74bdef68c1..901be9e420a4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -722,7 +722,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
if (sp->role.passthrough)
return sp->gfn;
- if (!sp->role.direct)
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] >> PAGE_SHIFT;
return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@ -736,7 +736,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
*/
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
- if (sp_has_gptes(sp))
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] & ACC_ALL;
/*
@@ -757,7 +757,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
gfn_t gfn, unsigned int access)
{
- if (sp_has_gptes(sp)) {
+ if (sp->shadowed_translation) {
sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
return;
}
@@ -1700,8 +1700,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
- if (!sp->role.direct)
- free_page((unsigned long)sp->shadowed_translation);
+ free_page((unsigned long)sp->shadowed_translation);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -2203,7 +2202,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
- if (!role.direct)
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -3308,7 +3307,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
{
/*
* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@ -3320,6 +3319,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
return false;
/*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
* #PF can be fast if:
*
* 1. The shadow page table entry is not present and A/D bits are
@@ -3419,7 +3438,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 *sptep;
uint retry_count = 0;
- if (!page_fault_can_be_fast(fault))
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3428,7 +3447,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 new_spte;
if (tdp_mmu_enabled)
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3438,7 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* available as the vCPU holds a reference to its root(s).
*/
if (WARN_ON_ONCE(!sptep))
- spte = REMOVED_SPTE;
+ spte = FROZEN_SPTE;
if (!is_shadow_present_pte(spte))
break;
@@ -4271,7 +4290,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+ * ignore stats for all other return values. Page-ready "faults" aren't
+ * truly spurious and never trigger emulation.
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4291,6 +4319,25 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ u8 max_level, int gmem_order)
+{
+ u8 req_max_level;
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
+ if (req_max_level)
+ max_level = min(max_level, req_max_level);
+
+ return req_max_level;
+}
+
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -4308,9 +4355,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
return r;
}
- fault->max_level = min(kvm_max_level_for_order(max_order),
- fault->max_level);
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
+ fault->max_level, max_order);
return RET_PF_CONTINUE;
}
@@ -4561,7 +4608,10 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(error_code >> 32))
error_code = lower_32_bits(error_code);
- /* Ensure the above sanity check also covers KVM-defined flags. */
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
@@ -4621,38 +4671,23 @@ out_unlock:
}
#endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
- * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
- * to honor the memtype from the guest's MTRRs so that guest accesses
- * to memory that is DMA'd aren't cached against the guest's wishes.
- *
- * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
- * e.g. KVM will force UC memtype for host MMIO.
+ * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
+ * not support self-snoop (or is affected by an erratum), and the VM
+ * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+ * honor the memtype from the guest's PAT so that guest accesses to
+ * memory that is DMA'd aren't cached against the guest's wishes. As a
+ * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+ * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
+ * bits in response to non-coherent device (un)registration.
*/
- return vm_has_noncoherent_dma && shadow_memtype_mask;
+ return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- /*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping.
- */
- if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
- for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = gfn_round_for_level(fault->gfn,
- fault->max_level);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
- }
- }
-
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
@@ -4661,6 +4696,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 end;
+ int r;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ kvm_mmu_reload(vcpu);
+
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ /*
+ * Shadow paging uses GVA for kvm page fault, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+ * If the mapping that covers range->gpa can use a huge page, it
+ * may start below it or end after range->gpa + range->size.
+ */
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -4988,7 +5096,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
static inline u64 reserved_hpa_bits(void)
{
- return rsvd_bits(shadow_phys_bits, 63);
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
}
/*
@@ -5633,7 +5741,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
* stale entries. Flushing on alloc also allows KVM to skip the TLB
* flush when freeing a root (see kvm_tdp_mmu_put_root()).
*/
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
out:
return r;
}
@@ -5886,14 +5994,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
}
if (r == RET_PF_INVALID) {
+ vcpu->stat.pf_taken++;
+
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
- &emulation_type);
+ &emulation_type, NULL);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
if (r < 0)
return r;
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+
if (r != RET_PF_EMULATE)
return 1;
@@ -5995,7 +6113,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
}
if (!mmu->sync_spte)
@@ -6787,6 +6905,7 @@ restart:
return need_tlb_flush;
}
+EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -6917,7 +7036,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
- LIST_HEAD(invalid_list);
/*
* Never scan more than sc->nr_to_scan VM instances.
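Worked example (hypothetical numbers) for the return-value arithmetic at the end of kvm_arch_vcpu_pre_fault_memory() above: a pre-fault at gpa 0x3f0000 with size 0x100000 that ends up being satisfied by a 2 MiB mapping rooted at 0x200000.

end = (0x3f0000 & ~0x1fffffULL) + 0x200000;	/* == 0x400000         */
return min(0x100000, 0x400000 - 0x3f0000);	/* == 0x10000 (64 KiB) */

/* Only the span up to the end of that huge page is reported as handled; the
 * generic caller is expected to advance gpa/size by the return value and
 * retry for the remainder. */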
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ce2fcd19ba6b..1721d97743e9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u64 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
- /*
- * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
- * guest perspective and have already been counted at the time of the
- * original fault.
- */
- if (!prefetch)
- vcpu->stat.pf_taken++;
-
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
@@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
- /*
- * Similar to above, prefetch faults aren't truly spurious, and the
- * async #PF path doesn't do emulation. Do count faults that are fixed
- * by the async #PF handler though, otherwise they'll never be counted.
- */
- if (r == RET_PF_FIXED)
- vcpu->stat.pf_fixed++;
- else if (prefetch)
- ;
- else if (r == RET_PF_EMULATE)
- vcpu->stat.pf_emulate++;
- else if (r == RET_PF_SPURIOUS)
- vcpu->stat.pf_spurious++;
return r;
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d3dbcf382ed2..69941cebb3a8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index a5e014d7bc62..d4527965e48c 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-u8 __read_mostly shadow_phys_bits;
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
void __init kvm_mmu_spte_module_init(void)
{
@@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void)
* will change when the vendor module is (re)loaded.
*/
allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
}
static u64 generation_mmio_spte_mask(u64 gen)
@@ -190,8 +210,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= PT_PAGE_SIZE_MASK;
if (shadow_memtype_mask)
- spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -271,18 +291,12 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
-u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
- int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
{
- u64 child_spte;
-
- if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
- return 0;
+ u64 child_spte = huge_spte;
- if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
- return 0;
-
- child_spte = huge_spte;
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
/*
* The child_spte already has the base address of the huge page being
@@ -383,7 +397,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
- WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
mmio_value = 0;
if (!mmio_value)
@@ -441,8 +455,6 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
- shadow_phys_bits = kvm_get_shadow_phys_bits();
-
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -494,7 +506,7 @@ void kvm_mmu_reset_all_pte_masks(void)
* 52-bit physical addresses then there are no reserved PA bits in the
* PTEs and so the reserved PA approach must be disabled.
*/
- if (shadow_phys_bits < 52)
+ if (kvm_host.maxphyaddr < 52)
mask = BIT_ULL(51) | PT_PRESENT_MASK;
else
mask = 0;
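
The kvm_get_host_maxphyaddr() helper added above deliberately reads the raw MAXPHYADDR from CPUID leaf 0x80000008 rather than boot_cpu_data.x86_phys_bits, which has MKTME/SME keyID bits subtracted. A minimal user-space sketch of the same decode, using GCC's <cpuid.h>; the 52-bit comparison mirrors the reserved-PA check in kvm_mmu_reset_all_pte_masks(), and the fallback value is an assumption made only to keep the sketch self-contained:

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

/* Raw MAXPHYADDR as reported by CPUID, i.e. including any keyID bits. */
static unsigned int raw_maxphyaddr(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return eax & 0xff;	/* EAX[7:0] = physical address bits */

	return 36;	/* leaf absent: assume a legacy default for this sketch */
}

int main(void)
{
	unsigned int maxphyaddr = raw_maxphyaddr();

	/*
	 * Mirrors the check above: with fewer than 52 physical address bits,
	 * bit 51 is reserved and can be used to trigger reserved-bit #PFs
	 * for non-present/MMIO SPTEs.
	 */
	printf("MAXPHYADDR=%u, reserved-PA bit %s\n", maxphyaddr,
	       maxphyaddr < 52 ? "available" : "unavailable");
	return 0;
}
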
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 52fa004a1fbc..ef793c459b05 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -202,7 +202,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
* If a thread running without exclusive control of the MMU lock must perform a
- * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
@@ -212,14 +212,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
-static inline bool is_removed_spte(u64 spte)
+static inline bool is_frozen_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == FROZEN_SPTE;
}
/* Get an SPTE's index into its parent's page table (and the spt array). */
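
The comment block above describes the freezing protocol: a writer that does not hold mmu_lock exclusively parks the entry at the special non-present FROZEN_SPTE value so concurrent walkers back off. A stand-alone sketch of that handshake using C11 atomics; the constants are stand-ins chosen only to keep the example self-contained, not the kernel's actual values:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Self-contained stand-ins; the real values live in spte.h. */
#define SKETCH_NONPRESENT	0ULL
#define SKETCH_FROZEN		(SKETCH_NONPRESENT | 0x5a0ULL)

/*
 * A writer that cannot hold the lock exclusively freezes the entry with a
 * single compare-and-exchange; anyone who loses the race, or who observes
 * the frozen value, simply gives up and retries later.
 */
static bool freeze_spte(_Atomic uint64_t *sptep, uint64_t expected)
{
	return atomic_compare_exchange_strong(sptep, &expected, SKETCH_FROZEN);
}

static bool spte_is_frozen(uint64_t spte)
{
	return spte == SKETCH_FROZEN;
}

int main(void)
{
	_Atomic uint64_t spte = 0x1234000ULL | 0x7ULL;	/* some present entry */
	uint64_t old = atomic_load(&spte);

	if (freeze_spte(&spte, old))
		printf("frozen; safe to do the multi-part update, then publish\n");

	printf("walker sees frozen=%d\n", spte_is_frozen(atomic_load(&spte)));

	/* Publish the final value (here: back to non-present) when done. */
	atomic_store(&spte, SKETCH_NONPRESENT);
	return 0;
}
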
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 36539c1b36cd..c7dc49ee7388 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -365,8 +365,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value.
*/
for (;;) {
- old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
break;
cpu_relax();
}
@@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* No retry is needed in the atomic update path as the
* sole concern is dropping a Dirty bit, i.e. no other
* task can zap/remove the SPTE as mmu_lock is held for
- * write. Marking the SPTE as a removed SPTE is not
+ * write. Marking the SPTE as a frozen SPTE is not
* strictly necessary for the same reason, but using
- * the remove SPTE value keeps the shared/exclusive
+ * the frozen SPTE value keeps the shared/exclusive
* paths consistent and allows the handle_changed_spte()
- * call below to hardcode the new value to REMOVED_SPTE.
+ * call below to hardcode the new value to FROZEN_SPTE.
*
* Note, even though dropping a Dirty bit is the only
* scenario where a non-atomic update could result in a
@@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* it here.
*/
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
- REMOVED_SPTE, level);
+ FROZEN_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, REMOVED_SPTE, level, shared);
+ old_spte, FROZEN_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE or removed SPTE,
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
* it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
!is_mmio_spte(kvm, new_spte) &&
- !is_removed_spte(new_spte)))
+ !is_frozen_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
- "a temporary removed SPTE.\n"
+ "a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -530,7 +530,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
-static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+ u64 new_spte)
{
u64 *sptep = rcu_dereference(iter->sptep);
@@ -540,7 +541,7 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
* and pre-checking before inserting a new SPTE is advantageous as it
* avoids unnecessary work.
*/
- WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
@@ -572,9 +573,9 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
* no side-effects other than setting iter->old_spte to the last
* known value of the spte.
*/
-static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
int ret;
@@ -590,8 +591,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}
-static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
int ret;
@@ -603,26 +604,26 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* in its place before the TLBs are flushed.
*
* Delay processing of the zapped SPTE until after TLBs are flushed and
- * the REMOVED_SPTE is replaced (see below).
+ * the FROZEN_SPTE is replaced (see below).
*/
- ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE);
+ ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
if (ret)
return ret;
kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
- * No other thread can overwrite the removed SPTE as they must either
+ * No other thread can overwrite the frozen SPTE as they must either
* wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
- * overwrite the special removed SPTE value. Use the raw write helper to
+ * overwrite the special frozen SPTE value. Use the raw write helper to
* avoid an unnecessary check on volatile bits.
*/
__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
/*
* Process the zapped SPTE after flushing TLBs, and after replacing
- * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are
- * blocked by the REMOVED_SPTE and reduces contention on the child
+ * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
+ * blocked by the FROZEN_SPTE and reduces contention on the child
* SPTEs.
*/
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
@@ -652,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
/*
* No thread should be using this function to set SPTEs to or from the
- * temporary removed SPTE value.
+ * temporary frozen SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
- * use of the removed SPTE should not be necessary.
+ * use of the frozen SPTE should not be necessary.
*/
- WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@@ -1126,7 +1127,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* If SPTE has been frozen by another thread, just give up and
* retry, avoiding unnecessary page table allocation and free.
*/
- if (is_removed_spte(iter.old_spte))
+ if (is_frozen_spte(iter.old_spte))
goto retry;
if (iter.level == fault->goal_level)
@@ -1339,17 +1340,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
struct kvm_mmu_page *sp;
- gfp |= __GFP_ZERO;
-
- sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
if (!sp)
return NULL;
- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1358,47 +1357,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
return sp;
}
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
- struct tdp_iter *iter,
- bool shared)
-{
- struct kvm_mmu_page *sp;
-
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
- /*
- * Since we are allocating while under the MMU lock we have to be
- * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
- * reclaim and to avoid making any filesystem callbacks (which can end
- * up invoking KVM MMU notifiers, resulting in a deadlock).
- *
- * If this allocation fails we drop the lock and retry with reclaim
- * allowed.
- */
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
- if (sp)
- return sp;
-
- rcu_read_unlock();
-
- if (shared)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
-
- iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
-
- if (shared)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- return sp;
-}
-
/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
@@ -1445,7 +1403,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
- int ret = 0;
rcu_read_lock();
@@ -1469,17 +1426,31 @@ retry:
continue;
if (!sp) {
- sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
if (!sp) {
- ret = -ENOMEM;
trace_kvm_mmu_split_huge_page(iter.gfn,
iter.old_spte,
- iter.level, ret);
- break;
+ iter.level, -ENOMEM);
+ return -ENOMEM;
}
- if (iter.yielded)
- continue;
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
}
tdp_mmu_init_child_sp(sp, &iter);
@@ -1500,7 +1471,7 @@ retry:
if (sp)
tdp_mmu_free_sp(sp);
- return ret;
+ return 0;
}
@@ -1801,12 +1772,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*
* WARNING: This function is only intended to be called during fast_page_fault.
*/
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
struct tdp_iter iter;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- gfn_t gfn = addr >> PAGE_SHIFT;
tdp_ptep_t sptep = NULL;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
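
The huge-page split path reworked above no longer attempts a GFP_NOWAIT allocation while mmu_lock is held; it unconditionally drops the lock, allocates with normal sleeping semantics, retakes the lock in the same mode, and marks the iteration as yielded so the walk restarts. A rough user-space sketch of that pattern, with a pthread rwlock standing in for mmu_lock; all names and types here are invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct split_sp { unsigned long spt[512]; };	/* stand-in for kvm_mmu_page */

/*
 * Never allocate while the lock is held: drop it, do a potentially-sleeping
 * allocation, retake it in the same mode, and tell the caller the walk must
 * be restarted (the equivalent of iter.yielded).
 */
static struct split_sp *alloc_sp_for_split(pthread_rwlock_t *mmu_lock,
					   bool shared, bool *yielded)
{
	struct split_sp *sp;

	pthread_rwlock_unlock(mmu_lock);

	sp = calloc(1, sizeof(*sp));

	if (shared)
		pthread_rwlock_rdlock(mmu_lock);
	else
		pthread_rwlock_wrlock(mmu_lock);

	*yielded = true;
	return sp;
}

int main(void)
{
	pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	bool yielded = false;
	struct split_sp *sp;

	pthread_rwlock_rdlock(&lock);
	sp = alloc_sp_for_split(&lock, true, &yielded);
	pthread_rwlock_unlock(&lock);

	free(sp);
	return yielded ? 0 : 1;
}
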
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 58b55e61bd33..1b74e058a81c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void)
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte);
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a67c28a56417..05490b9d8a43 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,33 +19,21 @@
#include <asm/mtrr.h>
#include "cpuid.h"
-#include "mmu.h"
-#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
-#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
-#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
-
-static bool is_mtrr_base_msr(unsigned int msr)
-{
- /* MTRR base MSRs use even numbers, masks use odd numbers. */
- return !(msr & 0x1);
-}
-
-static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
- unsigned int msr)
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
- int index = (msr - MTRRphysBase_MSR(0)) / 2;
-
- return &vcpu->arch.mtrr_state.var_ranges[index];
-}
+ int index;
-static bool msr_mtrr_valid(unsigned msr)
-{
switch (msr) {
case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
@@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
case MSR_MTRRdefType:
- return true;
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
}
- return false;
+ return NULL;
}
static bool valid_mtrr_type(unsigned t)
@@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
int i;
u64 mask;
- if (!msr_mtrr_valid(msr))
- return false;
-
if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
@@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
- msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
@@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!valid_mtrr_type(data & 0xff))
return false;
mask |= 0xf00;
- } else
+ } else {
/* MTRR mask */
mask |= 0x7ff;
-
- return (data & mask) == 0;
-}
-
-static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
-}
-
-static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
-}
-
-static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
-{
- return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
-}
-
-static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
-{
- /*
- * Intel SDM 11.11.2.2: all MTRRs are disabled when
- * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
- * memory type is applied to all of physical memory.
- *
- * However, virtual machines can be run with CPUID such that
- * there are no MTRRs. In that case, the firmware will never
- * enable MTRRs and it is obviously undesirable to run the
- * guest entirely with UC memory and we use WB.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
- return MTRR_TYPE_UNCACHABLE;
- else
- return MTRR_TYPE_WRBACK;
-}
-
-/*
-* Three terms are used in the following code:
-* - segment, it indicates the address segments covered by fixed MTRRs.
-* - unit, it corresponds to the MSR entry in the segment.
-* - range, a range is covered in one memory cache type.
-*/
-struct fixed_mtrr_segment {
- u64 start;
- u64 end;
-
- int range_shift;
-
- /* the start position in kvm_mtrr.fixed_ranges[]. */
- int range_start;
-};
-
-static struct fixed_mtrr_segment fixed_seg_table[] = {
- /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
- {
- .start = 0x0,
- .end = 0x80000,
- .range_shift = 16, /* 64K */
- .range_start = 0,
- },
-
- /*
- * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
- * 16K fixed mtrr.
- */
- {
- .start = 0x80000,
- .end = 0xc0000,
- .range_shift = 14, /* 16K */
- .range_start = 8,
- },
-
- /*
- * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
- * 4K fixed mtrr.
- */
- {
- .start = 0xc0000,
- .end = 0x100000,
- .range_shift = 12, /* 12K */
- .range_start = 24,
- }
-};
-
-/*
- * The size of unit is covered in one MSR, one MSR entry contains
- * 8 ranges so that unit size is always 8 * 2^range_shift.
- */
-static u64 fixed_mtrr_seg_unit_size(int seg)
-{
- return 8 << fixed_seg_table[seg].range_shift;
-}
-
-static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
-{
- switch (msr) {
- case MSR_MTRRfix64K_00000:
- *seg = 0;
- *unit = 0;
- break;
- case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
- *seg = 1;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix16K_80000,
- MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
- break;
- case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
- *seg = 2;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix4K_C0000,
- MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
- break;
- default:
- return false;
}
- return true;
-}
-
-static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- u64 unit_size = fixed_mtrr_seg_unit_size(seg);
-
- *start = mtrr_seg->start + unit * unit_size;
- *end = *start + unit_size;
- WARN_ON(*end > mtrr_seg->end);
-}
-
-static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
-
- WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
- > mtrr_seg->end);
-
- /* each unit has 8 ranges. */
- return mtrr_seg->range_start + 8 * unit;
-}
-
-static int fixed_mtrr_seg_end_range_index(int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int n;
-
- n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
- return mtrr_seg->range_start + n - 1;
-}
-
-static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return false;
-
- fixed_mtrr_seg_unit_range(seg, unit, start, end);
- return true;
-}
-
-static int fixed_msr_to_range_index(u32 msr)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return -1;
-
- return fixed_mtrr_seg_unit_range_index(seg, unit);
-}
-
-static int fixed_mtrr_addr_to_seg(u64 addr)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
-
- for (seg = 0; seg < seg_num; seg++) {
- mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
- return seg;
- }
-
- return -1;
-}
-
-static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int index;
-
- mtrr_seg = &fixed_seg_table[seg];
- index = mtrr_seg->range_start;
- index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
- return index;
-}
-
-static u64 fixed_mtrr_range_end_addr(int seg, int index)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int pos = index - mtrr_seg->range_start;
-
- return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
-}
-
-static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
-{
- u64 mask;
-
- *start = range->base & PAGE_MASK;
-
- mask = range->mask & PAGE_MASK;
-
- /* This cannot overflow because writing to the reserved bits of
- * variable MTRRs causes a #GP.
- */
- *end = (*start | ~mask) + 1;
-}
-
-static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- gfn_t start, end;
-
- if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
- return;
-
- if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
- return;
-
- /* fixed MTRRs. */
- if (fixed_msr_to_range(msr, &start, &end)) {
- if (!fixed_mtrr_is_enabled(mtrr_state))
- return;
- } else if (msr == MSR_MTRRdefType) {
- start = 0x0;
- end = ~0ULL;
- } else {
- /* variable range MTRRs. */
- var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
- }
-
- kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
-}
-
-static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
-{
- return (range->mask & (1 << 11)) != 0;
-}
-
-static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct kvm_mtrr_range *tmp, *cur;
-
- cur = var_mtrr_msr_to_range(vcpu, msr);
-
- /* remove the entry if it's in the list. */
- if (var_mtrr_range_is_valid(cur))
- list_del(&cur->node);
-
- /*
- * Set all illegal GPA bits in the mask, since those bits must
- * implicitly be 0. The bits are then cleared when reading them.
- */
- if (is_mtrr_base_msr(msr))
- cur->base = data;
- else
- cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
- /* add it to the list if it's enabled. */
- if (var_mtrr_range_is_valid(cur)) {
- list_for_each_entry(tmp, &mtrr_state->head, node)
- if (cur->base >= tmp->base)
- break;
- list_add_tail(&cur->node, &tmp->node);
- }
+ return (data & mask) == 0;
}
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- int index;
+ u64 *mtrr;
- if (!kvm_mtrr_valid(vcpu, msr, data))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0)
- *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
- else if (msr == MSR_MTRRdefType)
- vcpu->arch.mtrr_state.deftype = data;
- else
- set_var_mtrr_msr(vcpu, msr, data);
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
- update_mtrr(vcpu, msr);
+ *mtrr = data;
return 0;
}
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
- int index;
+ u64 *mtrr;
/* MSR_MTRRcap is a readonly MSR. */
if (msr == MSR_MTRRcap) {
@@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
- if (!msr_mtrr_valid(msr))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0) {
- *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
- } else if (msr == MSR_MTRRdefType) {
- *pdata = vcpu->arch.mtrr_state.deftype;
- } else {
- /* Variable MTRRs */
- if (is_mtrr_base_msr(msr))
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
- else
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
-
- *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- }
-
+ *pdata = *mtrr;
return 0;
}
-
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
-{
- INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
-}
-
-struct mtrr_iter {
- /* input fields. */
- struct kvm_mtrr *mtrr_state;
- u64 start;
- u64 end;
-
- /* output fields. */
- int mem_type;
- /* mtrr is completely disabled? */
- bool mtrr_disabled;
- /* [start, end) is not fully covered in MTRRs? */
- bool partial_map;
-
- /* private fields. */
- union {
- /* used for fixed MTRRs. */
- struct {
- int index;
- int seg;
- };
-
- /* used for var MTRRs. */
- struct {
- struct kvm_mtrr_range *range;
- /* max address has been covered in var MTRRs. */
- u64 start_max;
- };
- };
-
- bool fixed;
-};
-
-static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
-{
- int seg, index;
-
- if (!fixed_mtrr_is_enabled(iter->mtrr_state))
- return false;
-
- seg = fixed_mtrr_addr_to_seg(iter->start);
- if (seg < 0)
- return false;
-
- iter->fixed = true;
- index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
- iter->index = index;
- iter->seg = seg;
- return true;
-}
-
-static bool match_var_range(struct mtrr_iter *iter,
- struct kvm_mtrr_range *range)
-{
- u64 start, end;
-
- var_mtrr_range(range, &start, &end);
- if (!(start >= iter->end || end <= iter->start)) {
- iter->range = range;
-
- /*
- * the function is called when we do kvm_mtrr.head walking.
- * Range has the minimum base address which interleaves
- * [looker->start_max, looker->end).
- */
- iter->partial_map |= iter->start_max < start;
-
- /* update the max address has been covered. */
- iter->start_max = max(iter->start_max, end);
- return true;
- }
-
- return false;
-}
-
-static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
- if (match_var_range(iter, iter->range))
- return;
-
- iter->range = NULL;
- iter->partial_map |= iter->start_max < iter->end;
-}
-
-static void mtrr_lookup_var_start(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- iter->fixed = false;
- iter->start_max = iter->start;
- iter->range = NULL;
- iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
-
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
-{
- /* terminate the lookup. */
- if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
- iter->fixed = false;
- iter->range = NULL;
- return;
- }
-
- iter->index++;
-
- /* have looked up for all fixed MTRRs. */
- if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
- return mtrr_lookup_var_start(iter);
-
- /* switch to next segment. */
- if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
- iter->seg++;
-}
-
-static void mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_start(struct mtrr_iter *iter)
-{
- if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->mtrr_disabled = true;
- return;
- }
-
- if (!mtrr_lookup_fixed_start(iter))
- mtrr_lookup_var_start(iter);
-}
-
-static void mtrr_lookup_init(struct mtrr_iter *iter,
- struct kvm_mtrr *mtrr_state, u64 start, u64 end)
-{
- iter->mtrr_state = mtrr_state;
- iter->start = start;
- iter->end = end;
- iter->mtrr_disabled = false;
- iter->partial_map = false;
- iter->fixed = false;
- iter->range = NULL;
-
- mtrr_lookup_start(iter);
-}
-
-static bool mtrr_lookup_okay(struct mtrr_iter *iter)
-{
- if (iter->fixed) {
- iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
- return true;
- }
-
- if (iter->range) {
- iter->mem_type = iter->range->base & 0xff;
- return true;
- }
-
- return false;
-}
-
-static void mtrr_lookup_next(struct mtrr_iter *iter)
-{
- if (iter->fixed)
- mtrr_lookup_fixed_next(iter);
- else
- mtrr_lookup_var_next(iter);
-}
-
-#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
- for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
- mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
-
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
- const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
- | (1 << MTRR_TYPE_WRTHROUGH);
-
- start = gfn_to_gpa(gfn);
- end = start + PAGE_SIZE;
-
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- int curr_type = iter.mem_type;
-
- /*
- * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
- * Precedences.
- */
-
- if (type == -1) {
- type = curr_type;
- continue;
- }
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are identical, then that memory type is
- * used.
- */
- if (type == curr_type)
- continue;
-
- /*
- * If two or more variable memory ranges match and one of
- * the memory types is UC, the UC memory type used.
- */
- if (curr_type == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are WT and WB, the WT memory type is used.
- */
- if (((1 << type) & wt_wb_mask) &&
- ((1 << curr_type) & wt_wb_mask)) {
- type = MTRR_TYPE_WRTHROUGH;
- continue;
- }
-
- /*
- * For overlaps not defined by the above rules, processor
- * behavior is undefined.
- */
-
- /* We use WB for this undefined behavior. :( */
- return MTRR_TYPE_WRBACK;
- }
-
- if (iter.mtrr_disabled)
- return mtrr_disabled_type(vcpu);
-
- /* not contained in any MTRRs. */
- if (type == -1)
- return mtrr_default_type(mtrr_state);
-
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(iter.partial_map);
-
- return type;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
-
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
-
- start = gfn_to_gpa(gfn);
- end = gfn_to_gpa(gfn + page_num);
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- if (type == -1) {
- type = iter.mem_type;
- continue;
- }
-
- if (type != iter.mem_type)
- return false;
- }
-
- if (iter.mtrr_disabled)
- return true;
-
- if (!iter.partial_map)
- return true;
-
- if (type == -1)
- return true;
-
- return type == mtrr_default_type(mtrr_state);
-}
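
find_mtrr() above collapses the old msr_mtrr_valid()/fixed_msr_to_range_index()/var_mtrr_msr_to_range() plumbing into a single MSR-number-to-storage-slot lookup. A small user-space sketch of the same decode over the architectural MSR numbers (values as defined in the SDM/APM; the helper and printout are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Architectural MSR numbers, same values as the kernel headers. */
#define MSR_MTRRphysBase(n)	(0x200 + 2 * (n))
#define MSR_MTRRphysMask(n)	(0x200 + 2 * (n) + 1)
#define MSR_MTRRfix64K_00000	0x250
#define MSR_MTRRfix16K_80000	0x258
#define MSR_MTRRfix16K_A0000	0x259
#define MSR_MTRRfix4K_C0000	0x268
#define MSR_MTRRfix4K_F8000	0x26f
#define MSR_MTRRdefType		0x2ff
#define KVM_NR_VAR_MTRR		8

/* Mirror of the find_mtrr() decode: map an MSR number to its storage slot. */
static const char *mtrr_slot(uint32_t msr)
{
	if (msr >= MSR_MTRRphysBase(0) &&
	    msr <= MSR_MTRRphysMask(KVM_NR_VAR_MTRR - 1))
		return (msr & 1) ? "variable mask" : "variable base";
	if (msr == MSR_MTRRfix64K_00000)
		return "fixed 64K";
	if (msr >= MSR_MTRRfix16K_80000 && msr <= MSR_MTRRfix16K_A0000)
		return "fixed 16K";
	if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
		return "fixed 4K";
	if (msr == MSR_MTRRdefType)
		return "default type";
	return "not an MTRR MSR";
}

int main(void)
{
	printf("0x2ff -> %s\n", mtrr_slot(0x2ff));
	printf("0x203 -> %s\n", mtrr_slot(0x203));
	return 0;
}
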
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a593b03c9aed..47a46283c866 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -34,16 +34,16 @@ EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
/* Instruction-Accurate PDIR (PDIR++) */
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -69,7 +69,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* code. Each pmc, stored in kvm_pmc.idx field, is unique across
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
- * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
* [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
@@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.sample_period = get_sample_period(pmc, pmc->counter);
if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
- guest_cpuid_is_intel(pmc->vcpu)) {
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
@@ -469,11 +469,11 @@ static int reprogram_counter(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- if (fixed_ctr_ctrl & 0x1)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
- if (fixed_ctr_ctrl & 0x2)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
eventsel |= ARCH_PERFMON_EVENTSEL_USR;
- if (fixed_ctr_ctrl & 0x8)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
eventsel |= ARCH_PERFMON_EVENTSEL_INT;
new_config = (u64)fixed_ctr_ctrl;
}
@@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/*
- * Unused perf_events are only released if the corresponding MSRs
- * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
- * triggers KVM_REQ_PMU if cleanup is needed.
+ * Release unused perf_events if the corresponding guest MSRs weren't
+ * accessed during the last vCPU time slice (need_cleanup is set when
+ * the vCPU is scheduled back in).
*/
if (unlikely(pmu->need_cleanup))
kvm_pmu_cleanup(vcpu);
@@ -542,7 +542,7 @@ int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
if (!kvm_pmu_ops.check_rdpmc_early)
return 0;
- return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
+ return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -591,12 +591,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
- (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
+ (kvm_x86_call(get_cpl)(vcpu) != 0) &&
kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
@@ -607,7 +607,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
+ kvm_pmu_call(deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
@@ -622,14 +622,14 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
break;
}
- return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
- static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
+ return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
+ kvm_pmu_call(is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
+ struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -654,7 +654,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
default:
- return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ return kvm_pmu_call(get_msr)(vcpu, msr_info);
}
return 0;
@@ -681,13 +681,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
break;
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
pmu->global_status = data;
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
- data &= ~pmu->global_ctrl_mask;
+ data &= ~pmu->global_ctrl_rsvd;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
@@ -704,7 +704,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
* GLOBAL_STATUS, and so the set of reserved bits is the same.
*/
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
@@ -713,7 +713,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ return kvm_pmu_call(set_msr)(vcpu, msr_info);
}
return 0;
@@ -740,7 +740,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
- static_call_cond(kvm_x86_pmu_reset)(vcpu);
+ kvm_pmu_call(reset)(vcpu);
}
@@ -768,17 +768,17 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
+ pmu->global_ctrl_rsvd = ~0ull;
+ pmu->global_status_rsvd = ~0ull;
+ pmu->fixed_ctr_ctrl_rsvd = ~0ull;
+ pmu->pebs_enable_rsvd = ~0ull;
+ pmu->pebs_data_cfg_rsvd = ~0ull;
bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
if (!vcpu->kvm->arch.enable_pmu)
return;
- static_call(kvm_x86_pmu_refresh)(vcpu);
+ kvm_pmu_call(refresh)(vcpu);
/*
* At RESET, both Intel and AMD CPUs set all enable bits for general
@@ -796,7 +796,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- static_call(kvm_x86_pmu_init)(vcpu);
+ kvm_pmu_call(init)(vcpu);
kvm_pmu_refresh(vcpu);
}
@@ -818,7 +818,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
- static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
+ kvm_pmu_call(cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -846,8 +846,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- select_os = config & 0x1;
- select_user = config & 0x2;
+ select_os = config & INTEL_FIXED_0_KERNEL;
+ select_user = config & INTEL_FIXED_0_USER;
}
/*
@@ -857,7 +857,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
if (select_os == select_user)
return select_os;
- return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+ return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
+ select_user;
}
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 4d52b0b539ba..ad89d0bd6005 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -14,7 +14,8 @@
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
-#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define fixed_ctrl_field(ctrl_reg, idx) \
+ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
@@ -129,7 +130,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
- return !(pmu->global_ctrl_mask & data);
+ return !(pmu->global_ctrl_rsvd & data);
}
/* returns general purpose PMC with the specified MSR. Note that it can be
@@ -170,7 +171,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc))
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3;
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}
@@ -217,7 +219,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
- KVM_PMC_MAX_FIXED);
+ KVM_MAX_NR_FIXED_COUNTERS);
kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
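
The fixed_ctrl_field() macro reworked above extracts the per-counter nibble of IA32_FIXED_CTR_CTRL, whose kernel/user/PMI bits replace the bare 0x1/0x2/0x8 constants used elsewhere in the series. A stand-alone sketch of that extraction (bit layout per the architectural definition; the sample value in main() is illustrative):

#include <stdint.h>
#include <stdio.h>

/* Architectural layout: 4 control bits per fixed counter. */
#define INTEL_FIXED_BITS_STRIDE		4
#define INTEL_FIXED_BITS_MASK		0xfULL
#define INTEL_FIXED_0_KERNEL		(1ULL << 0)
#define INTEL_FIXED_0_USER		(1ULL << 1)
#define INTEL_FIXED_0_ENABLE_PMI	(1ULL << 3)

/* Same extraction as the fixed_ctrl_field() macro above. */
static uint64_t fixed_ctrl_field(uint64_t ctrl, unsigned int idx)
{
	return (ctrl >> (idx * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK;
}

int main(void)
{
	/* Enable fixed counter 1 for kernel+user with a PMI on overflow. */
	uint64_t ctrl = (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |
			 INTEL_FIXED_0_ENABLE_PMI) << (1 * INTEL_FIXED_BITS_STRIDE);
	uint64_t f1 = fixed_ctrl_field(ctrl, 1);

	printf("counter 1: os=%d usr=%d pmi=%d\n",
	       !!(f1 & INTEL_FIXED_0_KERNEL),
	       !!(f1 & INTEL_FIXED_0_USER),
	       !!(f1 & INTEL_FIXED_0_ENABLE_PMI));
	return 0;
}
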
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index d06d43d8d2aa..00e3c27d2a87 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -200,11 +200,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.base = dt.address;
smram->gdtr.limit = dt.size;
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.base = dt.address;
smram->idtr.limit = dt.size;
@@ -220,7 +220,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
smram->smm_revision = 0x00020000;
smram->smbase = vcpu->arch.smbase;
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#ifdef CONFIG_X86_64
@@ -250,13 +250,13 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.limit = dt.size;
smram->idtr.base = dt.address;
enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.limit = dt.size;
smram->gdtr.base = dt.address;
@@ -267,7 +267,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#endif
@@ -297,7 +297,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
* Kill the VM in the unlikely case of failure, because the VM
* can be in undefined state in this case.
*/
- if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ if (kvm_x86_call(enter_smm)(vcpu, &smram))
goto error;
kvm_smm_changed(vcpu, true);
@@ -305,24 +305,24 @@ void enter_smm(struct kvm_vcpu *vcpu)
if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
goto error;
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ if (kvm_x86_call(get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+ kvm_x86_call(set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
goto error;
@@ -354,7 +354,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ if (kvm_x86_call(set_efer)(vcpu, 0))
goto error;
#endif
@@ -479,11 +479,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
dt.address = smstate->gdtr.base;
dt.size = smstate->gdtr.limit;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
dt.address = smstate->idtr.base;
dt.size = smstate->idtr.limit;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
@@ -501,7 +501,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
if (r != X86EMUL_CONTINUE)
return r;
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return r;
@@ -535,13 +535,13 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
dt.size = smstate->idtr.limit;
dt.address = smstate->idtr.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
dt.size = smstate->gdtr.limit;
dt.address = smstate->gdtr.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
if (r != X86EMUL_CONTINUE)
@@ -554,7 +554,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return X86EMUL_CONTINUE;
@@ -576,7 +576,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
- static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+ kvm_x86_call(set_nmi_mask)(vcpu, false);
kvm_smm_changed(vcpu, false);
@@ -628,7 +628,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
* state (e.g. enter guest mode) before loading state from the SMM
* state-save area.
*/
- if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ if (kvm_x86_call(leave_smm)(vcpu, &smram))
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 55b9a6d96bcf..6f704c1037e5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm)
if (svm->nested.initialized)
return 0;
- vmcb02_page = snp_safe_alloc_page(&svm->vcpu);
+ vmcb02_page = snp_safe_alloc_page();
if (!vmcb02_page)
return -ENOMEM;
svm->nested.vmcb02.ptr = page_address(vmcb02_page);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index dfcc38bd97d3..22d5a65b410c 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.num_counters_gp);
if (pmu->version > 1) {
- pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
- pmu->global_status_mask = pmu->global_ctrl_mask;
+ pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd;
}
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
@@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
+ BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE);
- for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
+ for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
};
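
The amd_pmu_refresh() change above renames the mask fields to *_rsvd, but the derivation is unchanged: with PerfMonV2 only the low nr_arch_gp_counters bits of PERF_CNTR_GLOBAL_CTL are defined, so everything else is reserved for both GLOBAL_CTL and GLOBAL_STATUS. A minimal sketch of that derivation and the validity test it feeds (the counter count in main() is just an example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Only bits [nr_gp_counters-1:0] are defined; the rest are reserved. */
static uint64_t global_rsvd_bits(unsigned int nr_gp_counters)
{
	return ~((1ULL << nr_gp_counters) - 1);	/* assumes nr < 64 */
}

/* Same test as kvm_valid_perf_global_ctrl(): no reserved bit may be set. */
static bool global_ctrl_valid(uint64_t rsvd, uint64_t data)
{
	return !(rsvd & data);
}

int main(void)
{
	uint64_t rsvd = global_rsvd_bits(6);	/* e.g. AMD64_NUM_COUNTERS_CORE */

	printf("rsvd=%#llx, 0x3f valid=%d, bit63 valid=%d\n",
	       (unsigned long long)rsvd,
	       global_ctrl_valid(rsvd, 0x3f),
	       global_ctrl_valid(rsvd, 1ULL << 63));
	return 0;
}
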
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 95095a233a45..a16c873b3232 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,12 +19,14 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
+#include <asm/sev.h>
#include "mmu.h"
#include "x86.h"
@@ -37,7 +39,7 @@
#define GHCB_VERSION_DEFAULT 2ULL
#define GHCB_VERSION_MIN 1ULL
-#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
/* enable/disable SEV support */
static bool sev_enabled = true;
@@ -47,6 +49,10 @@ module_param_named(sev, sev_enabled, bool, 0444);
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
/* enable/disable SEV-ES DebugSwap support */
static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
@@ -56,6 +62,23 @@ static u64 sev_supported_vmsa_features;
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
+/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
+#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
+#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
+#define SNP_POLICY_MASK_SMT BIT_ULL(16)
+#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
+#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
+#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
+
+#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET)
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
+
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
@@ -66,6 +89,8 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
+static int snp_decommission_context(struct kvm *kvm);
+
struct enc_region {
struct list_head list;
unsigned long npages;
@@ -92,12 +117,17 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
down_write(&sev_deactivate_lock);
wbinvd_on_all_cpus();
- ret = sev_guest_df_flush(&error);
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);
if (ret)
- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
return ret;
}
@@ -233,6 +263,53 @@ static void sev_decommission(unsigned int handle)
sev_guest_decommission(&decommission, NULL);
}
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but leak the page should that
+ * happen since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
struct sev_data_deactivate deactivate;
@@ -250,6 +327,78 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_init *data,
unsigned long vm_type)
@@ -288,6 +437,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
if (sev->es_active && !sev->ghcb_version)
sev->ghcb_version = GHCB_VERSION_DEFAULT;
+ if (vm_type == KVM_X86_SNP_VM)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -297,6 +449,10 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
if (ret)
goto e_free;
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
+ goto e_free;
+
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
sev->need_init = false;
@@ -348,7 +504,8 @@ static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
- kvm->arch.vm_type != KVM_X86_SEV_ES_VM)
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
return -EINVAL;
if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
@@ -1999,6 +2156,410 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
}
}
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in a
+ * hypervisor-provided page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~SNP_POLICY_MASK_VALID)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
+ !(params.policy & SNP_POLICY_MASK_SMT))
+ return -EINVAL;
+
+ if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ return -EINVAL;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned;
+ int level;
+
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
+ __func__, gfn);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = -EINVAL;
+ goto err;
+ }
+
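+ /* Copy the source data into the page while it is still in the shared state. */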
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
+ if (ret)
+ goto err;
+ kunmap_local(vaddr);
+ }
+
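+ /* Transition the page to a guest-owned private state in the RMP table. */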
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed, handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages, which can be cleaned up below without
+ * needing to reclaim in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can use it
+ * to determine which CPUID leaves/fields failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip the cleanup below for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_can_be_private(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
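+ /* Encrypt and measure each vCPU's VMSA via SNP_LAUNCH_UPDATE. */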
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guests mandate LBR Virtualization to
+ * be _always_ ON. Enable it only after setting
+ * guest_state_protected because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reasons) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
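+ /* Optionally pass along the guest owner's ID block and authentication information. */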
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -2022,6 +2583,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
goto out;
}
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
switch (sev_cmd.id) {
case KVM_SEV_ES_INIT:
if (!sev_es_enabled) {
@@ -2086,6 +2656,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_RECEIVE_FINISH:
r = sev_receive_finish(kvm, &sev_cmd);
break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -2281,6 +2860,31 @@ e_source_fput:
return ret;
}
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommission, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2322,7 +2926,19 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- sev_unbind_asid(kvm, sev->handle);
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decommission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
sev_asid_free(sev);
}
@@ -2336,11 +2952,16 @@ void __init sev_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
}
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
}
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2427,6 +3048,7 @@ void __init sev_hardware_setup(void)
sev_es_asid_count = min_sev_asid - 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
if (boot_cpu_has(X86_FEATURE_SEV))
@@ -2439,9 +3061,15 @@ out:
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
sev_es_supported ? "enabled" : "disabled",
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ sev_snp_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
@@ -2520,7 +3148,13 @@ do_wbinvd:
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
- if (!sev_guest(kvm))
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, so these events only pertain to shared
+ * pages, where there is no need to perform WBINVD to flush the
+ * associated caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
wbinvd_on_all_cpus();
@@ -2535,11 +3169,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
+skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
@@ -2735,6 +3382,13 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
case SVM_VMGEXIT_NMI_COMPLETE:
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
@@ -2742,6 +3396,18 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_HV_FEATURES:
case SVM_VMGEXIT_TERM_REQUEST:
break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
+ break;
default:
reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
@@ -2929,6 +3595,534 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
svm->vmcb->control.ghcb_gpa = value;
}
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
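+ /* PSMASH operates on 2MB regions, so align the PFN down to a 2MB boundary. */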
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
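+ /* Relay userspace's KVM_HC_MAP_GPA_RANGE result to the guest via the GHCB MSR protocol. */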
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
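+ /* Forward the page-state change to userspace as a KVM_HC_MAP_GPA_RANGE hypercall exit. */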
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
+
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
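+ /* Mark the entry fully processed: 512 4K sub-pages for a 2M entry, 1 otherwise. */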
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+ * The PSC descriptor buffer can be modified by a misbehaved guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ unreachable();
+}
+
+static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
+ gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ struct kvm_memory_slot *slot;
+ kvm_pfn_t pfn;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return -EINVAL;
+
+ /*
+ * The new VMSA will be backed by private guest memory, so
+ * retrieve the PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+ return -EINVAL;
+
+ /*
+ * From this point forward, the VMSA will always be a
+ * guest-mapped page rather than the initial one allocated
+ * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
+ * could be freed and cleaned up here, but that involves
+ * cleanups like wbinvd_on_all_cpus() which would ideally
+ * be handled during teardown rather than guest boot.
+ * Deferring that also allows the existing logic for SEV-ES
+ * VMSAs to be re-used with minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever
+ * changes then care should be taken to ensure
+ * svm->sev_es.vmsa is pinned through some other means.
+ */
+ kvm_release_pfn_clean(pfn);
+ }
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ return 0;
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ return;
+
+ mutex_lock(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ goto unlock;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ ret = __sev_snp_update_protected_guest_state(vcpu);
+ if (ret)
+ vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
+
+unlock:
+ mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+ bool kick;
+ int ret;
+
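+ /* exit_info_1 encodes the request in the low 32 bits and the APIC ID in the high 32 bits. */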
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ ret = 0;
+
+ target_svm = to_svm(target_vcpu);
+
+ /*
+ * The target vCPU is valid, so the vCPU will be kicked unless the
+ * request is for CREATE_ON_INIT. For any errors at this stage, the
+ * kick will place the vCPU in a non-runnable state.
+ */
+ kick = true;
+
+ mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /* Interrupt injection mode shouldn't change for AP creation */
+ if (request < SVM_VMGEXIT_AP_DESTROY) {
+ u64 sev_features;
+
+ sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
+ sev_features ^= sev->vmsa_features;
+
+ if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ kick = false;
+ fallthrough;
+ case SVM_VMGEXIT_AP_CREATE:
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * A malicious guest can RMPADJUST a large page into a VMSA, which
+ * hits the SNP erratum where the CPU incorrectly signals an RMP
+ * violation #PF if a hugepage collides with the RMP entry of the
+ * VMSA page. Reject the AP CREATE request if the VMSA address
+ * provided by the guest is 2M-aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ ret = -EINVAL;
+ break;
+ }
+
+out:
+ if (kick) {
+ kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+ kvm_vcpu_kick(target_vcpu);
+ }
+
+ mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ return ret;
+}
+
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
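+ /* Copy the guest's request message into the firmware-accessible bounce buffer. */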
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+ * Firmware failures are propagated on to the guest, but any other
+ * failure condition along the way should be reported to userspace,
+ * e.g. if the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to expose an interface for userspace to supply it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -3008,6 +4202,38 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -3020,12 +4246,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
- vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
- vcpu->run->system_event.ndata = 1;
- vcpu->run->system_event.data[0] = control->ghcb_gpa;
-
- return 0;
+ goto out_terminate;
}
default:
/* Error, keep GHCB MSR value as-is */
@@ -3036,6 +4257,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
control->ghcb_gpa, ret);
return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
@@ -3071,6 +4300,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
sev_es_sync_from_ghcb(svm);
+
+ /* SEV-SNP guests require the GHCB GPA to be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
@@ -3145,6 +4381,28 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = control->ghcb_gpa;
break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+ if (ret) {
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3238,7 +4496,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa)
+ if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
@@ -3310,6 +4568,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
@@ -3331,9 +4591,9 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
- hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
- hostsa->xss = host_xss;
+ hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
@@ -3389,13 +4649,13 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
}
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
unsigned long pfn;
struct page *p;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
/*
* Allocate an SNP-safe page to workaround the SNP erratum where
@@ -3406,7 +4666,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -3420,3 +4680,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
return p;
}
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_can_be_private(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+ * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
+ * the process of being PSMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ put_page(pfn_to_page(pfn));
+}
+
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
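+ /* Use a single 2M RMP entry when the entire 2M-aligned range can be made private. */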
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
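+ /* Only do a 2M shared update if the PFN is 2M-aligned, the range covers the full 2M region, and the RMP entry is a large page. */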
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+ * This shouldn't fail, but if it does, report it and
+ * still try to update the RMP entry to shared and pray this
+ * was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+ * cache entries for these pages before freeing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
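+ /* Cap the mapping level at the page size recorded in the RMP entry for this PFN. */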
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c95d3900fe56..c115d26844f7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -53,6 +53,7 @@
#include "svm_onhyperv.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -570,6 +571,11 @@ static void __svm_write_tsc_multiplier(u64 multiplier)
__this_cpu_write(current_tsc_ratio, multiplier);
}
+static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return page_address(sd->save_area) + 0x400;
+}
+
static inline void kvm_cpu_svm_disable(void)
{
uint64_t efer;
@@ -674,12 +680,9 @@ static int svm_hardware_enable(void)
* TSC_AUX field now to avoid a RDMSR on every vCPU run.
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- struct sev_es_save_area *hostsa;
u32 __maybe_unused msr_hi;
- hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
-
- rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
return 0;
@@ -704,7 +707,7 @@ static int svm_cpu_init(int cpu)
int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
- sd->save_area = snp_safe_alloc_page(NULL);
+ sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
if (!sd->save_area)
return ret;
@@ -1202,7 +1205,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (guest_cpuid_is_intel(vcpu)) {
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
/*
* We must intercept SYSENTER_EIP and SYSENTER_ESP
* accesses because the processor only stores 32 bits.
@@ -1404,6 +1407,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
+ if (init_event)
+ sev_snp_init_protected_guest_state(vcpu);
+
init_vmcb(vcpu);
if (!init_event)
@@ -1427,7 +1433,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb01_page = snp_safe_alloc_page(vcpu);
+ vmcb01_page = snp_safe_alloc_page();
if (!vmcb01_page)
goto out;
@@ -1436,7 +1442,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
*/
- vmsa_page = snp_safe_alloc_page(vcpu);
+ vmsa_page = snp_safe_alloc_page();
if (!vmsa_page)
goto error_free_vmcb_page;
}
@@ -1501,11 +1507,6 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
-{
- return page_address(sd->save_area) + 0x400;
-}
-
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1551,6 +1552,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
@@ -2050,6 +2054,7 @@ static int pf_interception(struct kvm_vcpu *vcpu)
static int npf_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int rc;
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
@@ -2063,11 +2068,19 @@ static int npf_interception(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
error_code &= ~PFERR_SYNTHETIC_MASK;
+ if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
trace_kvm_page_fault(vcpu, fault_address, error_code);
- return kvm_mmu_page_fault(vcpu, fault_address, error_code,
- static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
- svm->vmcb->control.insn_bytes : NULL,
- svm->vmcb->control.insn_len);
+ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+
+ if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
+ sev_handle_rmp_fault(vcpu, fault_address, error_code);
+
+ return rc;
}
static int db_interception(struct kvm_vcpu *vcpu)
@@ -2875,12 +2888,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
@@ -3107,11 +3120,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* 32 bit part of these msrs to support Intel's
* implementation of SYSENTER/SYSEXIT.
*/
- svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
- svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
/*
@@ -4372,11 +4385,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
/*
- * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel(vcpu))
+ if (!guest_cpuid_is_intel_compatible(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
@@ -4595,12 +4608,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
/* [63:9] are reserved. */
@@ -4937,8 +4944,11 @@ static int svm_vm_init(struct kvm *kvm)
if (type != KVM_X86_DEFAULT_VM &&
type != KVM_X86_SW_PROTECTED_VM) {
- kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM);
+ kvm->arch.has_protected_state =
+ (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
to_kvm_sev_info(kvm)->need_init = true;
+
+ kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
}
if (!pause_filter_count || !pause_filter_thresh)
@@ -4955,7 +4965,7 @@ static int svm_vm_init(struct kvm *kvm)
static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
{
- struct page *page = snp_safe_alloc_page(vcpu);
+ struct page *page = snp_safe_alloc_page();
if (!page)
return NULL;
@@ -5060,8 +5070,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .sched_in = svm_sched_in,
-
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
@@ -5095,6 +5103,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
.alloc_apic_backing_page = svm_alloc_apic_backing_page,
+
+ .gmem_prepare = sev_gmem_prepare,
+ .gmem_invalidate = sev_gmem_invalidate,
+ .private_max_mapping_level = sev_private_max_mapping_level,
};
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 0f1472690b59..76107c7d0595 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -94,6 +94,10 @@ struct kvm_sev_info {
struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
+ void *snp_context; /* SNP guest context page */
+ void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
+ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
+ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
};
struct kvm_svm {
@@ -209,6 +213,18 @@ struct vcpu_sev_es_state {
u32 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ /* SNP Page-State-Change buffer entries currently being processed */
+ u16 psc_idx;
+ u16 psc_inflight;
+ bool psc_2m;
+
+ u64 ghcb_registered_gpa;
+
+ struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
+ gpa_t snp_vmsa_gpa;
+ bool snp_ap_waiting_for_reset;
+ bool snp_has_guest_vmsa;
};
struct vcpu_svm {
@@ -350,6 +366,23 @@ static __always_inline bool sev_es_guest(struct kvm *kvm)
#endif
}
+static __always_inline bool sev_snp_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
+ !WARN_ON_ONCE(!sev_es_guest(kvm));
+#else
+ return false;
+#endif
+}
+
+static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
+{
+ return svm->sev_es.ghcb_registered_gpa == val;
+}
+
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -638,7 +671,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
#define AVIC_REQUIRED_APICV_INHIBITS \
( \
- BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_NESTED) | \
@@ -696,7 +729,13 @@ void sev_guest_memory_reclaimed(struct kvm *kvm);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
/* These symbols are used in common code and are stubbed below. */
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp);
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
void sev_free_vcpu(struct kvm_vcpu *vcpu);
void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
@@ -705,9 +744,20 @@ void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
extern unsigned int max_sev_asid;
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
#else
-static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) {
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+}
+
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}
static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
@@ -718,6 +768,18 @@ static inline void sev_hardware_unsetup(void) {}
static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
#define max_sev_asid 0
+static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {}
+static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {}
+static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ return 0;
+}
+static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
+static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ return 0;
+}
+
#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e19fed438a67..d3aeffd6ae75 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -314,12 +314,12 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, \
- &__entry->exit_reason, \
- &__entry->info1, \
- &__entry->info2, \
- &__entry->intr_info, \
- &__entry->error_code); \
+ kvm_x86_call(get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
), \
\
TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
@@ -828,7 +828,8 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
+ __entry->csbase = kvm_x86_call(get_segment_base)(vcpu,
+ VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
@@ -1375,6 +1376,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+#define kvm_print_apicv_inhibit_reasons(inhibits) \
+ (inhibits), (inhibits) ? " " : "", \
+ (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : ""
+
TRACE_EVENT(kvm_apicv_inhibit_changed,
TP_PROTO(int reason, bool set, unsigned long inhibits),
TP_ARGS(reason, set, inhibits),
@@ -1391,9 +1396,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed,
__entry->inhibits = inhibits;
),
- TP_printk("%s reason=%u, inhibits=0x%lx",
+ TP_printk("%s reason=%u, inhibits=0x%lx%s%s",
__entry->set ? "set" : "cleared",
- __entry->reason, __entry->inhibits)
+ __entry->reason,
+ kvm_print_apicv_inhibit_reasons(__entry->inhibits))
);
TRACE_EVENT(kvm_apicv_accept_irq,
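
The kvm_print_apicv_inhibit_reasons() helper expands to the three trailing
arguments consumed by the "0x%lx%s%s" part of TP_printk(), so the separator and
the decoded names only appear when at least one inhibit bit is set. A rough
sketch of the resulting output, with placeholder flag names (the real names
come from the APICV_INHIBIT_REASONS table):

	/*
	 * inhibits == 0:	"... inhibits=0x0"
	 * inhibits == 0x5:	"... inhibits=0x5 REASON_A|REASON_C"
	 */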
@@ -1834,6 +1840,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
__entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
);
+/*
+ * Tracepoint for #NPFs due to RMP faults.
+ */
+TRACE_EVENT(kvm_rmp_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code,
+ int rmp_level, int psmash_ret),
+ TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, gpa)
+ __field(u64, pfn)
+ __field(u64, error_code)
+ __field(int, rmp_level)
+ __field(int, psmash_ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gpa = gpa;
+ __entry->pfn = pfn;
+ __entry->error_code = error_code;
+ __entry->rmp_level = rmp_level;
+ __entry->psmash_ret = psmash_ret;
+ ),
+
+ TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d",
+ __entry->vcpu_id, __entry->gpa, __entry->pfn,
+ __entry->error_code, __entry->rmp_level, __entry->psmash_ret)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d4ed681785fd..0bf35ebe8a1b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -8,7 +8,7 @@
#include "posted_intr.h"
#define VMX_REQUIRED_APICV_INHIBITS \
- (BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
@@ -97,7 +97,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
@@ -122,8 +121,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .sched_in = vmx_sched_in,
-
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 643935a0f70a..2392a7ef254d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
@@ -2425,7 +2426,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2438,7 +2439,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -3899,8 +3900,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
@@ -4031,10 +4032,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
@@ -4665,7 +4702,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4676,7 +4713,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index be40474de6e4..83382a4d1d66 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -348,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (data & pmu->fixed_ctr_ctrl_mask)
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
return 1;
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
case MSR_IA32_PEBS_ENABLE:
- if (data & pmu->pebs_enable_mask)
+ if (data & pmu->pebs_enable_rsvd)
return 1;
if (pmu->pebs_enable != data) {
@@ -371,7 +371,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmu->ds_area = data;
break;
case MSR_PEBS_DATA_CFG:
- if (data & pmu->pebs_data_cfg_mask)
+ if (data & pmu->pebs_data_cfg_rsvd)
return 1;
pmu->pebs_data_cfg = data;
@@ -436,8 +436,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
};
u64 eventsel;
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
- BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
+	BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS);
+	BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS);
/*
* Yell if perf reports support for a fixed counter but perf doesn't
@@ -448,6 +448,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
return eventsel;
}
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -456,8 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid10_eax eax;
union cpuid10_edx edx;
u64 perf_capabilities;
- u64 counter_mask;
- int i;
+ u64 counter_rsvd;
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
@@ -501,22 +508,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
- pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
- counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
- pmu->global_ctrl_mask = counter_mask;
+ pmu->global_ctrl_rsvd = counter_rsvd;
/*
* GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
* share reserved bit definitions. The kernel just happens to use
* OVF_CTRL for the names.
*/
- pmu->global_status_mask = pmu->global_ctrl_mask
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_status_mask &=
+ pmu->global_status_rsvd &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -544,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
- pmu->pebs_enable_mask = counter_mask;
+ pmu->pebs_enable_rsvd = counter_rsvd;
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
- }
- pmu->pebs_data_cfg_mask = ~0xff00000full;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
} else {
- pmu->pebs_enable_mask =
+ pmu->pebs_enable_rsvd =
~((1ull << pmu->nr_arch_gp_counters) - 1);
}
}
@@ -564,14 +570,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+	for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
@@ -731,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 6b2a0226257e..1715d2ab07be 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
+
+#include <linux/find.h>
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -12,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
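
pi_find_highest_vector() returns -1 when no bit is set in the 256-bit PIR,
which is why its callers test for a strictly positive vector before comparing
priority. A minimal sketch of the check performed by the nested code earlier in
this patch (pir_vector_deliverable() is a hypothetical helper, shown only to
illustrate the priority-class comparison):

	static inline bool pir_vector_deliverable(struct pi_desc *pi_desc, u32 vppr)
	{
		int vec = pi_find_highest_vector(pi_desc);

		/* Compare the priority class (upper nibble) against the vPPR. */
		return vec > 0 && (vec & 0xf0) > (vppr & 0xf0);
	}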
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 01936013428b..56fd150a6f24 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -188,12 +188,13 @@ struct __packed vmcs12 {
};
/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adheres
+ * to KVM's virtual CPU definition.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
- * older kvm releases.
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -206,7 +207,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b3c83c06f826..f18c2d8c7476 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -74,6 +74,7 @@
#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -259,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -404,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -1123,12 +1124,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1141,7 +1142,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1411,6 +1412,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1486,6 +1519,9 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
@@ -2525,17 +2561,15 @@ static bool cpu_has_sgx(void)
*/
static bool cpu_has_perf_global_ctrl_bug(void)
{
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
- case INTEL_FAM6_NEHALEM: /* AAP115 */
- case INTEL_FAM6_WESTMERE: /* AAT100 */
- case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
- case INTEL_FAM6_NEHALEM_EX: /* BA97 */
- return true;
- default:
- break;
- }
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ return true;
+ default:
+ break;
}
return false;
@@ -2834,9 +2868,6 @@ int vmx_hardware_enable(void)
return r;
}
- if (enable_ept)
- ept_sync_global();
-
return 0;
}
@@ -4108,26 +4139,6 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4357,7 +4368,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -5052,14 +5063,19 @@ int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -5897,38 +5913,6 @@ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6677,9 +6661,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
+ * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@@ -7665,39 +7650,25 @@ int vmx_vm_init(struct kvm *kvm)
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
-
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /*
+ * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+ * device attached and the CPU doesn't support self-snoop. Letting the
+ * guest control memory types on Intel CPUs without self-snoop may
+ * result in unexpected behavior, and so KVM's (historical) ABI is to
+ * trust the guest to behave only as a last resort.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
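
For reference, the returned value encodes the memory type in bits 5:3 of the
leaf EPT entry, with bit 6 acting as "ignore guest PAT". A quick worked example
using the standard encodings (MTRR_TYPE_WRBACK = 6, VMX_EPT_MT_EPTE_SHIFT = 3,
VMX_EPT_IPAT_BIT = bit 6), not new code from this patch:

	/*
	 * Forced WB, guest PAT ignored:
	 *	(MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT
	 *	= (6 << 3) | (1 << 6) = 0x70
	 *
	 * Host MMIO forced UC, guest PAT still honored:
	 *	MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT = 0 << 3 = 0x0
	 */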
@@ -8179,12 +8150,6 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
}
#endif
-void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8396,18 +8361,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
@@ -8629,9 +8592,9 @@ static void __vmx_exit(void)
static void vmx_exit(void)
{
kvm_exit();
+ __vmx_exit();
kvm_x86_vendor_exit();
- __vmx_exit();
}
module_exit(vmx_exit);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7b64e271a931..42498fa63abb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -406,6 +406,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
@@ -727,7 +728,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return true;
return allow_smaller_maxphyaddr &&
- cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 502704596c83..ce3221cd1d01 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -46,10 +46,8 @@ bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
void vmx_migrate_timers(struct kvm_vcpu *vcpu);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
-bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void vmx_hwapic_isr_update(int max_isr);
-bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
@@ -111,8 +109,6 @@ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
-void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
-void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu);
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
#ifdef CONFIG_X86_64
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0763a0f72a06..af6c8cf6a37a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -100,6 +100,9 @@
struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
+struct kvm_host_values kvm_host __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_host);
+
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
#define emul_to_vcpu(ctxt) \
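
For context, the new kvm_host structure gathers the host values that were
previously tracked as separate globals. Judging purely from the renames in this
diff, kvm_host_values (declared in x86.h, not shown here) carries at least the
following fields; the exact layout and types are an assumption:

	struct kvm_host_values {
		u64 efer;			/* was host_efer */
		u64 xcr0;			/* was host_xcr0 */
		u64 xss;			/* was host_xss */
		u64 arch_capabilities;		/* was host_arch_capabilities */
		u8  maxphyaddr;			/* was kvm_get_shadow_phys_bits() */
	};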
@@ -220,21 +223,12 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
-u64 __read_mostly host_efer;
-EXPORT_SYMBOL_GPL(host_efer);
-
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
-u64 __read_mostly host_xss;
-EXPORT_SYMBOL_GPL(host_xss);
-
-u64 __read_mostly host_arch_capabilities;
-EXPORT_SYMBOL_GPL(host_arch_capabilities);
-
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -308,8 +302,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
-u64 __read_mostly host_xcr0;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -833,7 +825,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
+ if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -917,7 +909,7 @@ static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return false;
- return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+ return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
}
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
@@ -954,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
@@ -981,7 +968,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -995,7 +982,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
(is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -1016,11 +1003,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
+ vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1047,12 +1034,12 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
- wrmsrl(MSR_IA32_XSS, host_xss);
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
}
@@ -1113,7 +1100,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 ||
__kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1138,7 +1125,7 @@ EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
return __kvm_is_valid_cr4(vcpu, cr4) &&
- static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+ kvm_x86_call(is_valid_cr4)(vcpu, cr4);
}
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
@@ -1206,7 +1193,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- static_call(kvm_x86_set_cr4)(vcpu, cr4);
+ kvm_x86_call(set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1345,7 +1332,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- static_call(kvm_x86_set_dr7)(vcpu, dr7);
+ kvm_x86_call(set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1461,10 +1448,10 @@ static const u32 msrs_to_save_pmu[] = {
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_CORE_PERF_GLOBAL_CTRL,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
- /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
@@ -1477,7 +1464,7 @@ static const u32 msrs_to_save_pmu[] = {
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
- /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
@@ -1619,7 +1606,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -1688,7 +1675,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- return static_call(kvm_x86_get_msr_feature)(msr);
+ return kvm_x86_call(get_msr_feature)(msr);
}
return 0;
}
@@ -1762,7 +1749,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = static_call(kvm_x86_set_efer)(vcpu, efer);
+ r = kvm_x86_call(set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1877,11 +1864,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
- * bits check if and only if the guest CPU is Intel, and clear
- * the bits in all other cases. This ensures cross-vendor
- * migration will provide consistent behavior for the guest.
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
*/
- if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
@@ -1892,7 +1879,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return static_call(kvm_x86_set_msr)(vcpu, &msr);
+ return kvm_x86_call(set_msr)(vcpu, &msr);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1934,7 +1921,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
+ ret = kvm_x86_call(get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -2002,7 +1989,7 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
@@ -2066,7 +2053,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -2091,7 +2078,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_write_ex(ecx, data);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
@@ -2616,12 +2603,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
if (is_guest_mode(vcpu))
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
l1_offset,
- static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_offset)(vcpu),
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_offset = l1_offset;
- static_call(kvm_x86_write_tsc_offset)(vcpu);
+ kvm_x86_call(write_tsc_offset)(vcpu);
}
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
@@ -2632,12 +2619,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
if (is_guest_mode(vcpu))
vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
l1_multiplier,
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
if (kvm_caps.has_tsc_control)
- static_call(kvm_x86_write_tsc_multiplier)(vcpu);
+ kvm_x86_call(write_tsc_multiplier)(vcpu);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -3610,7 +3597,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_all)(vcpu);
+ kvm_x86_call(flush_tlb_all)(vcpu);
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
@@ -3631,7 +3618,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_flush_tlb_guest)(vcpu);
+ kvm_x86_call(flush_tlb_guest)(vcpu);
/*
* Flushing all "guest" TLB is always a superset of Hyper-V's fine
@@ -3644,7 +3631,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
}
/*
@@ -4703,8 +4690,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_X86_GUEST_MODE:
r = 1;
break;
+ case KVM_CAP_PRE_FAULT_MEMORY:
+ r = tdp_enabled;
+ break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ r = APIC_BUS_CYCLE_NS_DEFAULT;
+ break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;
break;
@@ -4753,7 +4747,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
+ r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
@@ -4833,7 +4827,7 @@ static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
{
if (attr->group) {
if (kvm_x86_ops.dev_get_attr)
- return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
+ return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
return -ENXIO;
}
@@ -4995,16 +4989,25 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (static_call(kvm_x86_has_wbinvd_exit)())
+ if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- static_call(kvm_x86_vcpu_load)(vcpu, cpu);
+ kvm_x86_call(vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -5112,14 +5115,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- static_call(kvm_x86_vcpu_put)(vcpu);
+ kvm_x86_call(vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5236,7 +5239,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
kvm_apic_after_set_mcg_cap(vcpu);
- static_call(kvm_x86_setup_mce)(vcpu);
+ kvm_x86_call(setup_mce)(vcpu);
out:
return r;
}
@@ -5396,11 +5399,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
- events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
+ events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
/* events->sipi_vector is never valid when reporting to user space */
@@ -5482,8 +5485,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- static_call(kvm_x86_set_interrupt_shadow)(vcpu,
- events->interrupt.shadow);
+ kvm_x86_call(set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
@@ -5492,7 +5495,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->nmi.pending)
kvm_make_request(KVM_REQ_NMI, vcpu);
}
- static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
+ kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -5840,7 +5843,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
+ return kvm_x86_call(enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5879,8 +5882,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
- GFP_KERNEL_ACCOUNT);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!u.lapic)
@@ -6073,7 +6075,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
break;
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6104,7 +6106,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_XSAVE2: {
int size = vcpu->arch.guest_fpu.uabi_size;
- u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(size, GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -6122,7 +6124,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -6330,14 +6332,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
+ ret = kvm_x86_call(set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
+ return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -6543,9 +6545,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
if (kvm->created_vcpus)
goto split_irqchip_unlock;
- r = kvm_setup_empty_irq_routing(kvm);
- if (r)
- goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@@ -6650,14 +6649,14 @@ split_irqchip_unlock:
if (!kvm_x86_ops.vm_copy_enc_context_from)
break;
- r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
if (!kvm_x86_ops.vm_move_enc_context_from)
break;
- r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
@@ -6692,7 +6691,9 @@ split_irqchip_unlock:
break;
mutex_lock(&kvm->lock);
- if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
+ ;
+ } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
r = 0;
} else if (!kvm->arch.max_vcpu_ids) {
kvm->arch.max_vcpu_ids = cap->args[0];
@@ -6745,6 +6746,30 @@ split_irqchip_unlock:
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ u64 bus_cycle_ns = cap->args[0];
+ u64 unused;
+
+ /*
+ * Guard against overflow in tmict_to_ns(). 128 is the highest
+ * divide value that can be programmed in APIC_TDCR.
+ */
+ r = -EINVAL;
+ if (!bus_cycle_ns ||
+ check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ break;
+
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ r = -ENXIO;
+ else if (kvm->created_vcpus)
+ r = -EINVAL;
+ else
+ kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
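
The bound enforced for KVM_CAP_X86_APIC_BUS_CYCLES_NS above comes from the
worst case that tmict_to_ns() has to handle: a full 32-bit initial count scaled
by the maximum APIC_TDCR divide value of 128 and then by the bus cycle length.
A quick worked bound (illustrative only):

	/*
	 * U32_MAX * 128 * bus_cycle_ns must fit in a u64. U32_MAX * 128 is
	 * just under 2^39, so bus_cycle_ns can be at most a little over
	 * 2^25 ns (~33.5 ms per APIC bus cycle) before check_mul_overflow()
	 * trips and the capability is rejected with -EINVAL.
	 */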
@@ -7213,6 +7238,9 @@ set_pit2_out:
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
+ else if (arg > KVM_MAX_VCPU_IDS ||
+ (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
+ r = -EINVAL;
else
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
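
Together with the KVM_CAP_MAX_VCPU_ID hunk above, this makes the capability and
KVM_SET_BOOT_CPU_ID order-independent: whichever value userspace sets second is
validated against the first. A rough VMM-side sketch (illustrative only; vm_fd
is an open VM file descriptor and error handling is omitted):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MAX_VCPU_ID,
		.args[0] = 8,
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);			/* max IDs = 8 */
	ioctl(vm_fd, KVM_SET_BOOT_CPU_ID, (unsigned long)4);	/* ok: 4 <= 8 */
	/* KVM_SET_BOOT_CPU_ID with 16 would now fail with -EINVAL. */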
@@ -7289,7 +7317,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
- r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
+ r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -7303,7 +7331,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_register_region)
goto out;
- r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -7317,7 +7345,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_unregister_region)
goto out;
- r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_unregister_region)(kvm, &region);
break;
}
#ifdef CONFIG_KVM_HYPERV
@@ -7411,17 +7439,20 @@ static void kvm_probe_msr_to_save(u32 msr_index)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
return;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ case MSR_ARCH_PERFMON_PERFCTR0 ...
+ MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ...
+ MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ...
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
kvm_pmu_cap.num_counters_fixed)
return;
@@ -7452,7 +7483,7 @@ static void kvm_init_msr_lists(void)
{
unsigned i;
- BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
+ BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
"Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
@@ -7468,7 +7499,8 @@ static void kvm_init_msr_lists(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
+ if (!kvm_x86_call(has_emulated_msr)(NULL,
+ emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -7527,13 +7559,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_set_segment)(vcpu, var, seg);
+ kvm_x86_call(set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_get_segment)(vcpu, var, seg);
+ kvm_x86_call(get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
@@ -7556,7 +7588,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
@@ -7566,7 +7598,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
@@ -7619,7 +7651,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -7644,7 +7676,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -7667,7 +7699,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -7712,7 +7744,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -7733,8 +7765,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
- return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7784,8 +7816,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
bool write)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
- | (write ? PFERR_WRITE_MASK : 0);
+ u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
/*
* currently PKRU is only applied to ept enabled guest so
@@ -8211,7 +8243,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return static_call(kvm_x86_get_segment_base)(vcpu, seg);
+ return kvm_x86_call(get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -8224,7 +8256,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (static_call(kvm_x86_has_wbinvd_exit)()) {
+ if (kvm_x86_call(has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -8328,27 +8360,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
+ return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -8495,8 +8527,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
- &ctxt->exception);
+ return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
@@ -8521,6 +8553,11 @@ static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
+static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -8533,7 +8570,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
+ kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
@@ -8578,7 +8615,8 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
if (!kvm_x86_ops.get_untagged_addr)
return addr;
- return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags);
+ return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt),
+ addr, flags);
}
static const struct x86_emulate_ops emulate_ops = {
@@ -8619,6 +8657,7 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
+ .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
.set_nmi_mask = emulator_set_nmi_mask,
.is_smm = emulator_is_smm,
.is_guest_mode = emulator_is_guest_mode,
@@ -8630,7 +8669,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -8641,7 +8680,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -8682,7 +8721,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -8738,9 +8777,8 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
*/
memset(&info, 0, sizeof(info));
- static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
- &info[2], (u32 *)&info[3],
- (u32 *)&info[4]);
+ kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2],
+ (u32 *)&info[3], (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
@@ -8817,7 +8855,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) {
prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -8975,10 +9013,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
int r;
- r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
+ r = kvm_x86_call(skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -9000,19 +9038,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
- u32 shadow;
-
if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
return true;
/*
- * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
- * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
- * to avoid the relatively expensive CPUID lookup.
+ * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ * active, but AMD compatible CPUs do not.
*/
- shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
- return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
- guest_cpuid_is_intel(vcpu);
+ if (!guest_cpuid_is_intel_compatible(vcpu))
+ return false;
+
+ return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
}
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
@@ -9284,7 +9320,7 @@ restart:
writeback:
if (writeback) {
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -9301,7 +9337,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
+ kvm_x86_call(update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -9700,7 +9736,7 @@ static int kvm_x86_check_processor_compatibility(void)
__cr4_reserved_bits(cpu_has, &boot_cpu_data))
return -EIO;
- return static_call(kvm_x86_check_processor_compatibility)();
+ return kvm_x86_call(check_processor_compatibility)();
}
static void kvm_x86_check_cpu_compat(void *ret)
@@ -9772,19 +9808,19 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
- rdmsrl_safe(MSR_EFER, &host_efer);
+ rdmsrl_safe(MSR_EFER, &kvm_host.efer);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ rdmsrl(MSR_IA32_XSS, kvm_host.xss);
kvm_init_pmu_capability(ops->pmu_ops);
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
r = ops->hardware_setup();
if (r != 0)
@@ -9843,7 +9879,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
out_unwind_ops:
kvm_x86_ops.hardware_enable = NULL;
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
out_free_percpu:
@@ -9874,7 +9910,7 @@ void kvm_x86_vendor_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_call(hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -10000,7 +10036,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
- ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+ ulong vcpu_reasons =
+ kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu);
return (vm_reasons | vcpu_reasons) == 0;
}
@@ -10009,6 +10046,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
{
+ const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+
+ BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+
if (set)
__set_bit(reason, inhibits);
else
@@ -10020,7 +10061,7 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
static void kvm_apicv_init(struct kvm *kvm)
{
enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
- APICV_INHIBIT_REASON_DISABLE;
+ APICV_INHIBIT_REASON_DISABLED;
set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
@@ -10182,7 +10223,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = kvm_rdx_read(vcpu);
a3 = kvm_rsi_read(vcpu);
op_64_bit = is_64_bit_hypercall(vcpu);
- cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
@@ -10214,7 +10255,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return X86EMUL_PROPAGATE_FAULT;
}
- static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
+ kvm_x86_call(patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -10231,7 +10272,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
+ kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -10241,6 +10282,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
+ if (is_guest_mode(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -10266,7 +10309,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
+ kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr);
}
@@ -10296,7 +10339,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
- static_call(kvm_x86_inject_exception)(vcpu);
+ kvm_x86_call(inject_exception)(vcpu);
}
/*
@@ -10382,9 +10425,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
else if (kvm_is_exception_pending(vcpu))
; /* see above */
else if (vcpu->arch.nmi_injected)
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
else if (vcpu->arch.interrupt.injected)
- static_call(kvm_x86_inject_irq)(vcpu, true);
+ kvm_x86_call(inject_irq)(vcpu, true);
/*
* Exceptions that morph to VM-Exits are handled above, and pending
@@ -10469,7 +10512,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
*/
#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
- r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10478,27 +10522,29 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
enter_smm(vcpu);
can_inject = false;
} else
- static_call(kvm_x86_enable_smi_window)(vcpu);
+ kvm_x86_call(enable_smi_window)(vcpu);
}
#endif
if (vcpu->arch.nmi_pending) {
- r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
can_inject = false;
- WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
+ WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- static_call(kvm_x86_enable_nmi_window)(vcpu);
+ kvm_x86_call(enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10506,17 +10552,17 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (!WARN_ON_ONCE(irq == -1)) {
kvm_queue_interrupt(vcpu, irq, false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ kvm_x86_call(inject_irq)(vcpu, false);
+ WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0);
}
}
if (kvm_cpu_has_injectable_intr(vcpu))
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, true))
*req_immediate_exit = true;
/*
@@ -10557,7 +10603,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* blocks NMIs). KVM will immediately inject one of the two NMIs, and
* will request an NMI window to handle the second NMI.
*/
- if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
+ if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
else
limit = 2;
@@ -10566,14 +10612,14 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* Adjust the limit to account for pending virtual NMIs, which aren't
* tracked in vcpu->arch.nmi_pending.
*/
- if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ if (kvm_x86_call(is_vnmi_pending)(vcpu))
limit--;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
if (vcpu->arch.nmi_pending &&
- (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ (kvm_x86_call(set_vnmi_pending)(vcpu)))
vcpu->arch.nmi_pending--;
if (vcpu->arch.nmi_pending)
@@ -10584,7 +10630,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
{
return vcpu->arch.nmi_pending +
- static_call(kvm_x86_is_vnmi_pending)(vcpu);
+ kvm_x86_call(is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -10618,7 +10664,7 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
apic->apicv_active = activate;
kvm_apic_update_apicv(vcpu);
- static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+ kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu);
/*
* When APICv gets disabled, we may still have injected interrupts
@@ -10718,7 +10764,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
@@ -10743,17 +10789,17 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
#endif
- static_call_cond(kvm_x86_load_eoi_exitmap)(
+ kvm_x86_call(load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
- static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+ kvm_x86_call(guest_memory_reclaimed)(kvm);
}
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -10761,7 +10807,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ kvm_x86_call(set_apic_access_page_addr)(vcpu);
}
/*
@@ -10925,10 +10971,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- static_call(kvm_x86_msr_filter_changed)(vcpu);
+ kvm_x86_call(msr_filter_changed)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
- static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+ kvm_x86_call(update_cpu_dirty_logging)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
+ r = 1;
+ goto out;
+ }
+ }
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -10950,7 +11004,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (req_int_win)
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -10965,7 +11019,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
+ kvm_x86_call(prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -11001,7 +11055,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -11045,12 +11099,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
+ req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -11069,7 +11124,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
+ kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
}
@@ -11098,7 +11153,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.xfd_no_write_intercept)
fpu_sync_guest_vmexit_xfd_state();
- static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ kvm_x86_call(handle_exit_irqoff)(vcpu);
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrl(MSR_IA32_XFD_ERR, 0);
@@ -11131,6 +11186,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
/*
+	 * Call this to ensure WC buffers in the guest are evicted after each
+	 * VM-Exit, so that the evicted WC writes can be snooped across all CPUs.
+ */
+ smp_mb__after_srcu_read_lock();
+
+ /*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
@@ -11144,13 +11205,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+ r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_cancel_injection)(vcpu);
+ kvm_x86_call(cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -11200,7 +11261,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
* causes a spurious wakeup from HLT).
*/
if (is_guest_mode(vcpu)) {
- if (kvm_check_nested_events(vcpu) < 0)
+ int r = kvm_check_nested_events(vcpu);
+
+ WARN_ON_ONCE(r == -EBUSY);
+ if (r < 0)
return 0;
}
@@ -11237,7 +11301,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
int r;
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
/*
@@ -11387,7 +11450,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
@@ -11465,12 +11528,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->mmio_needed);
}
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
- r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ r = kvm_x86_call(vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
@@ -11598,10 +11661,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -11743,7 +11806,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
- if (ret) {
+
+ /*
+	 * Report an error to userspace if MMIO is needed, as KVM doesn't support
+ * MMIO during a task switch (or any other complex operation).
+ */
+ if (ret || vcpu->mmio_needed) {
+ vcpu->mmio_needed = false;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -11801,27 +11870,27 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
- static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
+ kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
+ kvm_x86_call(set_efer)(vcpu, sregs->efer);
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
+ kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
+ kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
if (update_pdptrs) {
idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -11999,7 +12068,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
@@ -12136,7 +12205,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
if (id >= kvm->arch.max_vcpu_ids)
return -EINVAL;
- return static_call(kvm_x86_vcpu_precreate)(kvm);
+ return kvm_x86_call(vcpu_precreate)(kvm);
}
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -12207,14 +12276,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.hv_root_tdp = INVALID_PAGE;
#endif
- r = static_call(kvm_x86_vcpu_create)(vcpu);
+ r = kvm_x86_call(vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
- kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
@@ -12265,7 +12333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- static_call(kvm_x86_vcpu_free)(vcpu);
+ kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -12383,7 +12451,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
- static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+ kvm_x86_call(vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
@@ -12402,10 +12470,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
else
new_cr0 |= X86_CR0_NW | X86_CR0_CD;
- static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
- static_call(kvm_x86_set_efer)(vcpu, 0);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(set_cr0)(vcpu, new_cr0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
+ kvm_x86_call(set_efer)(vcpu, 0);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
/*
* On the standard CR0/CR4/EFER modification paths, there are several
@@ -12462,7 +12530,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
- ret = static_call(kvm_x86_hardware_enable)();
+ ret = kvm_x86_call(hardware_enable)();
if (ret != 0)
return ret;
@@ -12544,7 +12612,7 @@ int kvm_arch_hardware_enable(void)
void kvm_arch_hardware_disable(void)
{
- static_call(kvm_x86_hardware_disable)();
+ kvm_x86_call(hardware_disable)();
drop_user_return_notifiers();
}
@@ -12558,18 +12626,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-
- vcpu->arch.l1tf_flush_l1d = true;
- if (pmu->version && unlikely(pmu->event_count)) {
- pmu->need_cleanup = true;
- kvm_make_request(KVM_REQ_PMU, vcpu);
- }
- static_call(kvm_x86_sched_in)(vcpu, cpu);
-}
-
void kvm_arch_free_vm(struct kvm *kvm)
{
#if IS_ENABLED(CONFIG_HYPERV)
@@ -12597,7 +12653,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_mmu_init_vm(kvm);
- ret = static_call(kvm_x86_vm_init)(kvm);
+ ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
@@ -12620,6 +12676,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
@@ -12771,7 +12828,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
kvm_unload_vcpu_mmus(kvm);
- static_call_cond(kvm_x86_vm_destroy)(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
@@ -13100,12 +13157,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- return (is_guest_mode(vcpu) &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
-}
-
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
@@ -13123,22 +13174,23 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- static_call(kvm_x86_nmi_allowed)(vcpu, false)))
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
return true;
#ifdef CONFIG_KVM_SMM
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
- static_call(kvm_x86_smi_allowed)(vcpu, false)))
+ kvm_x86_call(smi_allowed)(vcpu, false)))
return true;
#endif
if (kvm_test_request(KVM_REQ_PMI, vcpu))
return true;
- if (kvm_arch_interrupt_allowed(vcpu) &&
- (kvm_cpu_has_interrupt(vcpu) ||
- kvm_guest_apic_has_interrupt(vcpu)))
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
return true;
if (kvm_hv_has_stimer_pending(vcpu))
@@ -13146,7 +13198,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
return true;
if (kvm_xen_has_pending_events(vcpu))
@@ -13163,7 +13215,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
}
bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
@@ -13191,7 +13243,7 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return static_call(kvm_x86_get_cpl)(vcpu) == 0;
+ return kvm_x86_call(get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
@@ -13206,7 +13258,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
+ return kvm_x86_call(interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -13232,7 +13284,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ rflags = kvm_x86_call(get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -13244,7 +13296,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- static_call(kvm_x86_set_rflags)(vcpu, rflags);
+ kvm_x86_call(set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -13356,7 +13408,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (vcpu->arch.apf.send_user_only &&
- static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ kvm_x86_call(get_cpl)(vcpu) == 0)
return false;
if (is_guest_mode(vcpu)) {
@@ -13467,7 +13519,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_pi_start_assignment)(kvm);
+ kvm_x86_call(pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -13486,13 +13538,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
- * Non-coherent DMA assignment and de-assignment will affect
- * whether KVM honors guest MTRRs and cause changes in memtypes
- * in TDP.
- * So, pass %true unconditionally to indicate non-coherent DMA was,
- * or will be involved, and that zapping SPTEs might be necessary.
+ * Non-coherent DMA assignment and de-assignment may affect whether or
+ * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+ * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
+	 * (or last) non-coherent device is (un)registered so that new SPTEs
+ * with the correct "ignore guest PAT" setting are created.
*/
- if (__kvm_mmu_honors_guest_mtrrs(true))
+ if (kvm_mmu_may_ignore_guest_pat())
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -13530,9 +13582,8 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
-
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
@@ -13555,7 +13606,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
	 * in this case) doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -13566,7 +13618,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
+ return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
@@ -13589,6 +13641,24 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_SNP_VM;
+}
+
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+ return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_x86_call(gmem_invalidate)(start, end);
+}
+#endif
int kvm_spec_ctrl_test_value(u64 value)
{
@@ -13974,6 +14044,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
static int __init kvm_x86_init(void)
{
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d80a4c6b5a38..50596f6f8320 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,6 +33,20 @@ struct kvm_caps {
u64 supported_perf_cap;
};
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 arch_capabilities;
+};
+
void kvm_spurious_fault(void);
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -159,7 +173,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -311,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
@@ -325,11 +335,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-extern u64 host_xcr0;
-extern u64 host_xss;
-extern u64 host_arch_capabilities;
-
extern struct kvm_caps kvm_caps;
+extern struct kvm_host_values kvm_host;
extern bool enable_pmu;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f65b35a05d91..622fe24da910 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -741,7 +741,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
- if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ if (!PAGE_ALIGNED(hva)) {
r = -EINVAL;
} else if (!hva) {
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
@@ -1270,7 +1270,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -1650,7 +1650,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);