author     Linus Torvalds <torvalds@linux-foundation.org>   2020-02-06 09:07:45 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-02-06 09:07:45 -0800
commit     90568ecf561540fa330511e21fcd823b0c3829c6 (patch)
tree       494a5874c0ca19f39d74154f38c226b1a85d9572 /arch/x86
parent     d854b2d639fd61ccdc184385ee4036658a52e57e (diff)
parent     a8be1ad01b795bd2a13297ddbaecdb956ab0efd0 (diff)
Merge tag 'kvm-5.6-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini:
"s390:
- fix register corruption
- ENOTSUPP/EOPNOTSUPP mixed
- reset cleanups/fixes
- selftests
x86:
- Bug fixes and cleanups
- AMD support for APIC virtualization even in combination with
in-kernel PIT or IOAPIC.
MIPS:
- Compilation fix.
Generic:
- Fix refcount overflow for zero page"
* tag 'kvm-5.6-2' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (42 commits)
KVM: vmx: delete meaningless vmx_decache_cr0_guest_bits() declaration
KVM: x86: Mark CR4.UMIP as reserved based on associated CPUID bit
x86: vmxfeatures: rename features for consistency with KVM and manual
KVM: SVM: relax conditions for allowing MSR_IA32_SPEC_CTRL accesses
KVM: x86: Fix perfctr WRMSR for running counters
x86/kvm/hyper-v: don't allow to turn on unsupported VMX controls for nested guests
x86/kvm/hyper-v: move VMX controls sanitization out of nested_enable_evmcs()
kvm: mmu: Separate generating and setting mmio ptes
kvm: mmu: Replace unsigned with unsigned int for PTE access
KVM: nVMX: Remove stale comment from nested_vmx_load_cr3()
KVM: MIPS: Fold comparecount_func() into comparecount_wakeup()
KVM: MIPS: Fix a build error due to referencing not-yet-defined function
x86/kvm: do not setup pv tlb flush when not paravirtualized
KVM: fix overflow of zero page refcount with ksm running
KVM: x86: Take a u64 when checking for a valid dr7 value
KVM: x86: use raw clock values consistently
KVM: x86: reorganize pvclock_gtod_data members
KVM: nVMX: delete meaningless nested_vmx_run() declaration
KVM: SVM: allow AVIC without split irqchip
kvm: ioapic: Lazy update IOAPIC EOI
...
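Several of the commits above replace the old boolean "enable APICv" hook with a per-VM bitmask of inhibit reasons (kvm_apicv_activated()/kvm_request_apicv_update() in the arch/x86/kvm/x86.c hunk further down): APICv is active only while no reason bit is set, and each subsystem (Hyper-V SynIC, PIT reinject, IRQ window, nesting) toggles its own bit. The following is a minimal userspace sketch of that bookkeeping only — the reason values and helper names are illustrative, and it omits the locking and cross-vCPU request the kernel performs:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical reason bits, mirroring the idea of APICV_INHIBIT_REASON_*. */
enum { REASON_DISABLE = 0, REASON_HYPERV = 1, REASON_PIT_REINJ = 4 };

static unsigned long inhibit_reasons;	/* 0 means APICv is active */

static bool apicv_activated(void)
{
	return inhibit_reasons == 0;
}

/* Set or clear one reason bit; report only when the overall state flips. */
static void request_apicv_update(bool activate, unsigned int bit)
{
	unsigned long old = inhibit_reasons;

	if (activate)
		inhibit_reasons &= ~(1UL << bit);
	else
		inhibit_reasons |= 1UL << bit;

	if ((old == 0) != (inhibit_reasons == 0))
		printf("APICv %s (reason bit %u)\n",
		       apicv_activated() ? "activated" : "deactivated", bit);
}

int main(void)
{
	request_apicv_update(false, REASON_PIT_REINJ); /* PIT reinject -> deactivate */
	request_apicv_update(false, REASON_HYPERV);    /* second reason, already off */
	request_apicv_update(true, REASON_PIT_REINJ);  /* still inhibited by HYPERV */
	request_apicv_update(true, REASON_HYPERV);     /* last reason cleared -> reactivate */
	return 0;
}

The point of the bitmask over a plain boolean is visible in the third call: clearing one reason does not reactivate APICv while another reason is still outstanding.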
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_host.h     |  18
-rw-r--r--  arch/x86/include/asm/vmx.h          |   6
-rw-r--r--  arch/x86/include/asm/vmxfeatures.h  |   6
-rw-r--r--  arch/x86/kernel/kvm.c               |   3
-rw-r--r--  arch/x86/kvm/hyperv.c               |   5
-rw-r--r--  arch/x86/kvm/i8254.c                |  12
-rw-r--r--  arch/x86/kvm/ioapic.c               | 149
-rw-r--r--  arch/x86/kvm/lapic.c                |  22
-rw-r--r--  arch/x86/kvm/lapic.h                |   1
-rw-r--r--  arch/x86/kvm/mmu/mmu.c              |  37
-rw-r--r--  arch/x86/kvm/svm.c                  | 166
-rw-r--r--  arch/x86/kvm/trace.h                |  19
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c            |  85
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h            |   3
-rw-r--r--  arch/x86/kvm/vmx/nested.c           |  13
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c        |   9
-rw-r--r--  arch/x86/kvm/vmx/vmx.c              |  34
-rw-r--r--  arch/x86/kvm/x86.c                  | 139
-rw-r--r--  arch/x86/kvm/x86.h                  |   2
19 files changed, 554 insertions, 175 deletions
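One of the items above, arch/x86/kvm/vmx/evmcs.c, stops permanently clearing the nested VMX capability MSRs when enlightened VMCS is enabled and instead filters the MSR values at read time and validates vmcs12 controls at VM-entry (nested_evmcs_filter_control_msr()/nested_evmcs_check_controls() in the patch below). The general filtering pattern can be sketched in isolation; the mask value here is a placeholder, not the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Placeholder for a control bit that eVMCS v1 cannot express. */
#define UNSUPPORTED_EXIT_CTRL	(1u << 12)

/*
 * Filter a VMX capability MSR value: the low 32 bits hold the allowed
 * 0-settings, the high 32 bits the allowed 1-settings.  Clearing a bit
 * in the high half advertises the feature as unavailable to the guest.
 */
static uint64_t filter_exit_ctls(uint64_t msr)
{
	uint32_t low  = (uint32_t)msr;
	uint32_t high = (uint32_t)(msr >> 32);

	high &= ~UNSUPPORTED_EXIT_CTRL;
	return low | ((uint64_t)high << 32);
}

int main(void)
{
	uint64_t msr = ((uint64_t)0xffffffff << 32) | 0x11ff; /* made-up capability value */

	printf("0x%016llx\n", (unsigned long long)filter_exit_ctls(msr));
	return 0;
}

Filtering at read time (rather than rewriting the stored capabilities once eVMCS is enabled) leaves the full capability set intact for host-initiated reads and for guests that never enable enlightened VMCS.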
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 329d01c689b7..4dffbc10d3f8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -78,6 +78,8 @@ #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_APICV_UPDATE \ + KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -873,6 +875,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +#define APICV_INHIBIT_REASON_DISABLE 0 +#define APICV_INHIBIT_REASON_HYPERV 1 +#define APICV_INHIBIT_REASON_NESTED 2 +#define APICV_INHIBIT_REASON_IRQWIN 3 +#define APICV_INHIBIT_REASON_PIT_REINJ 4 + struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; @@ -904,6 +912,7 @@ struct kvm_arch { struct kvm_apic_map *apic_map; bool apic_access_page_done; + unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1118,7 +1127,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(struct kvm *kvm); + bool (*check_apicv_inhibit_reasons)(ulong bit); + void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1477,7 +1487,11 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); +bool kvm_apicv_activated(struct kvm *kvm); +void kvm_apicv_init(struct kvm *kvm, bool enable); +void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d380b3b7ddd9..2a85287b3685 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -22,8 +22,8 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. 
*/ -#define CPU_BASED_INTR_WINDOW_EXITING VMCS_CONTROL_BIT(VIRTUAL_INTR_PENDING) -#define CPU_BASED_USE_TSC_OFFSETTING VMCS_CONTROL_BIT(TSC_OFFSETTING) +#define CPU_BASED_INTR_WINDOW_EXITING VMCS_CONTROL_BIT(INTR_WINDOW_EXITING) +#define CPU_BASED_USE_TSC_OFFSETTING VMCS_CONTROL_BIT(USE_TSC_OFFSETTING) #define CPU_BASED_HLT_EXITING VMCS_CONTROL_BIT(HLT_EXITING) #define CPU_BASED_INVLPG_EXITING VMCS_CONTROL_BIT(INVLPG_EXITING) #define CPU_BASED_MWAIT_EXITING VMCS_CONTROL_BIT(MWAIT_EXITING) @@ -34,7 +34,7 @@ #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) -#define CPU_BASED_NMI_WINDOW_EXITING VMCS_CONTROL_BIT(VIRTUAL_NMI_PENDING) +#define CPU_BASED_NMI_WINDOW_EXITING VMCS_CONTROL_BIT(NMI_WINDOW_EXITING) #define CPU_BASED_MOV_DR_EXITING VMCS_CONTROL_BIT(MOV_DR_EXITING) #define CPU_BASED_UNCOND_IO_EXITING VMCS_CONTROL_BIT(UNCOND_IO_EXITING) #define CPU_BASED_USE_IO_BITMAPS VMCS_CONTROL_BIT(USE_IO_BITMAPS) diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 0d04d8bf15a5..a50e4a0de315 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -34,8 +34,8 @@ #define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* EPTP switching (in guest) */ /* Primary Processor-Based VM-Execution Controls, word 1 */ -#define VMX_FEATURE_VIRTUAL_INTR_PENDING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */ -#define VMX_FEATURE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */ +#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */ +#define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */ #define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* "" VM-Exit on HLT */ #define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* "" VM-Exit on INVLPG */ #define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* "" VM-Exit on MWAIT */ @@ -46,7 +46,7 @@ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. 
TPR shadow */ -#define VMX_FEATURE_VIRTUAL_NMI_PENDING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */ +#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */ #define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */ #define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/ #define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* "" VM-Exit based on I/O port */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 81045aabb6f4..d817f255aed8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -736,6 +736,9 @@ static __init int kvm_setup_pv_tlb_flush(void) { int cpu; + if (!kvm_para_available() || nopv) + return 0; + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4df1c965bf1a..a86fda7a1d03 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -776,9 +776,10 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) /* * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so deactivate APICV + * not compatible with APICV, so request + * to deactivate APICV permanently. */ - kvm_vcpu_deactivate_apicv(vcpu); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4a6dc54cc12b..b24c606ac04b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -295,12 +295,24 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) if (atomic_read(&ps->reinject) == reinject) return; + /* + * AMD SVM AVIC accelerates EOI write and does not trap. + * This cause in-kernel PIT re-inject mode to fail + * since it checks ps->irq_ack before kvm_set_irq() + * and relies on the ack notifier to timely queue + * the pt->worker work iterm and reinject the missed tick. + * So, deactivate APICv when PIT is in reinject mode. + */ if (reinject) { + kvm_request_apicv_update(kvm, false, + APICV_INHIBIT_REASON_PIT_REINJ); /* The initial state is preserved while ps->reinject == 0. 
*/ kvm_pit_reset_reinject(pit); kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } else { + kvm_request_apicv_update(kvm, true, + APICV_INHIBIT_REASON_PIT_REINJ); kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 26aa22cb9b29..7668fed1ce65 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -49,6 +49,11 @@ static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); +static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, + int trigger_mode, + int pin); + static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, unsigned long addr, unsigned long length) @@ -154,10 +159,16 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) __rtc_irq_eoi_tracking_restore_one(vcpu); } -static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu, + int vector) { - if (test_and_clear_bit(vcpu->vcpu_id, - ioapic->rtc_status.dest_map.map)) { + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; + + /* RTC special handling */ + if (test_bit(vcpu->vcpu_id, dest_map->map) && + (vector == dest_map->vectors[vcpu->vcpu_id]) && + (test_and_clear_bit(vcpu->vcpu_id, + ioapic->rtc_status.dest_map.map))) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -171,6 +182,31 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) return false; } +static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) +{ + int i; + struct kvm_vcpu *vcpu; + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) { + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + entry->fields.dest_id, + entry->fields.dest_mode) || + kvm_apic_pending_eoi(vcpu, entry->fields.vector)) + continue; + + /* + * If no longer has pending EOI in LAPICs, update + * EOI for this vetor. + */ + rtc_irq_eoi(ioapic, vcpu, entry->fields.vector); + kvm_ioapic_update_eoi_one(vcpu, ioapic, + entry->fields.trig_mode, + irq); + break; + } +} + static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, int irq_level, bool line_status) { @@ -189,6 +225,15 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, } /* + * AMD SVM AVIC accelerate EOI write and do not trap, + * in-kernel IOAPIC will not be able to receive the EOI. + * In this case, we do lazy update of the pending EOI when + * trying to set IOAPIC irq. + */ + if (kvm_apicv_activated(ioapic->kvm)) + ioapic_lazy_update_eoi(ioapic, irq); + + /* * Return 0 for coalesced interrupts; for edge-triggered interrupts, * this only happens if a previous edge has not been delivered due * to masking. 
For level interrupts, the remote_irr field tells @@ -454,72 +499,68 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work) } #define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 - -static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, - struct kvm_ioapic *ioapic, int vector, int trigger_mode) +static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, + int trigger_mode, + int pin) { - struct dest_map *dest_map = &ioapic->rtc_status.dest_map; struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - /* RTC special handling */ - if (test_bit(vcpu->vcpu_id, dest_map->map) && - vector == dest_map->vectors[vcpu->vcpu_id]) - rtc_irq_eoi(ioapic, vcpu); - - for (i = 0; i < IOAPIC_NUM_PINS; i++) { - union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; - - if (ent->fields.vector != vector) - continue; + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin]; - /* - * We are dropping lock while calling ack notifiers because ack - * notifier callbacks for assigned devices call into IOAPIC - * recursively. Since remote_irr is cleared only after call - * to notifiers if the same vector will be delivered while lock - * is dropped it will be put into irr and will be delivered - * after ack notifier returns. - */ - spin_unlock(&ioapic->lock); - kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); - spin_lock(&ioapic->lock); + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. + */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); + spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) - continue; + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + return; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); - ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << i))) { - ++ioapic->irq_eoi[i]; - if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { - /* - * Real hardware does not deliver the interrupt - * immediately during eoi broadcast, and this - * lets a buggy guest make slow progress - * even if it does not correctly handle a - * level-triggered interrupt. Emulate this - * behavior if we detect an interrupt storm. - */ - schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); - ioapic->irq_eoi[i] = 0; - trace_kvm_ioapic_delayed_eoi_inj(ent->bits); - } else { - ioapic_service(ioapic, i, false); - } + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << pin))) { + ++ioapic->irq_eoi[pin]; + if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. 
+ */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[pin] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); } else { - ioapic->irq_eoi[i] = 0; + ioapic_service(ioapic, pin, false); } + } else { + ioapic->irq_eoi[pin] = 0; } } void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) { + int i; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; spin_lock(&ioapic->lock); - __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); + rtc_irq_eoi(ioapic, vcpu, vector); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i); + } spin_unlock(&ioapic->lock); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cce1e6b204c8..eafc631d305c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2187,6 +2187,21 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) pr_warn_once("APIC base relocation is unsupported by KVM"); } +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (vcpu->arch.apicv_active) { + /* irr_pending is always true when apicv is activated. */ + apic->irr_pending = true; + apic->isr_count = 1; + } else { + apic->irr_pending = (apic_search_irr(apic) != -1); + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + } +} +EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); + void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -2229,8 +2244,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = vcpu->arch.apicv_active; - apic->isr_count = vcpu->arch.apicv_active ? 1 : 0; + kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2487,9 +2501,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); start_apic_timer(apic); - apic->irr_pending = true; - apic->isr_count = vcpu->arch.apicv_active ? 
- 1 : count_vectors(apic->regs + APIC_ISR); + kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { kvm_x86_ops->apicv_post_state_restore(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ec730ce7a344..ec6fbfe325cf 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -91,6 +91,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index adc84f0f16ba..7011a4e54866 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -451,9 +451,9 @@ static u64 get_mmio_spte_generation(u64 spte) return gen; } -static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, - unsigned access) +static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -464,6 +464,17 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + return mask; +} + +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, + unsigned int access) +{ + u64 mask = make_mmio_spte(vcpu, gfn, access); + unsigned int gen = get_mmio_spte_generation(mask); + + access = mask & ACC_ALL; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -484,7 +495,7 @@ static unsigned get_mmio_spte_access(u64 spte) } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned access) + kvm_pfn_t pfn, unsigned int access) { if (unlikely(is_noslot_pfn(pfn))) { mark_mmio_spte(vcpu, sptep, gfn, access); @@ -2475,7 +2486,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int direct, - unsigned access) + unsigned int access) { union kvm_mmu_page_role role; unsigned quadrant; @@ -2990,7 +3001,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int level, + unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -3081,9 +3092,10 @@ set_pte: return ret; } -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned int pte_access, int write_fault, int level, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, + bool host_writable) { int was_rmapped = 0; int rmap_count; @@ -3165,7 +3177,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned access = sp->role.access; + unsigned int access = sp->role.access; int i, ret; gfn_t gfn; @@ -3400,7 +3412,8 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned 
access, int *ret_val) + kvm_pfn_t pfn, unsigned int access, + int *ret_val) { /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(pfn))) { @@ -4005,7 +4018,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); - unsigned access = get_mmio_spte_access(spte); + unsigned int access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) return RET_PF_INVALID; @@ -4349,7 +4362,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned access, int *nr_present) + unsigned int access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9dbb990c319a..a3e32d61d60c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -387,6 +387,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); +static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); +static inline void avic_post_state_restore(struct kvm_vcpu *vcpu); static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_intercept(struct vcpu_svm *svm); @@ -1545,7 +1547,10 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; } static void init_vmcb(struct vcpu_svm *svm) @@ -1729,23 +1734,28 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_init_access_page(struct kvm_vcpu *vcpu) +static int avic_update_access_page(struct kvm *kvm, bool activate) { - struct kvm *kvm = vcpu->kvm; int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) + /* + * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger + * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT + * memory region. So, we need to ensure that kvm->mm == current->mm. + */ + if ((kvm->arch.apic_access_page_done == activate) || + (kvm->mm != current->mm)) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); + activate ? 
PAGE_SIZE : 0); if (ret) goto out; - kvm->arch.apic_access_page_done = true; + kvm->arch.apic_access_page_done = activate; out: mutex_unlock(&kvm->slots_lock); return ret; @@ -1753,21 +1763,24 @@ out: static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - int ret; u64 *entry, new_entry; int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - ret = avic_init_access_page(vcpu); - if (ret) - return ret; - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) return -EINVAL; if (!svm->vcpu.arch.apic->regs) return -EINVAL; + if (kvm_apicv_activated(vcpu->kvm)) { + int ret; + + ret = avic_update_access_page(vcpu->kvm, true); + if (ret) + return ret; + } + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); /* Setting AVIC backing page address in the phy APIC ID table */ @@ -2052,6 +2065,18 @@ free_avic: return err; } +static int svm_vm_init(struct kvm *kvm) +{ + if (avic) { + int ret = avic_vm_init(kvm); + if (ret) + return ret; + } + + kvm_apicv_init(kvm, avic); + return 0; +} + static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { @@ -2223,7 +2248,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. */ - svm->avic_is_running = true; + if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) + svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); @@ -2348,6 +2374,8 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); avic_set_running(vcpu, true); } @@ -4197,6 +4225,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; @@ -4282,6 +4312,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; @@ -4440,6 +4472,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); + + /* + * For AVIC, the only reason to end up here is ExtINTs. + * In this case AVIC was temporarily disabled for + * requesting the IRQ window and we have to re-enable it. 
+ */ + svm_toggle_avic_for_irq_window(&svm->vcpu, true); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; @@ -5135,30 +5175,79 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } -static bool svm_get_enable_apicv(struct kvm *kvm) +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { - return avic && irqchip_split(kvm); } -static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { } -static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) { + if (!avic || !lapic_in_kernel(vcpu)) + return; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_request_apicv_update(vcpu->kvm, activate, + APICV_INHIBIT_REASON_IRQWIN); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +} + +static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; + + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + + if (list_empty(&svm->ir_list)) + goto out; + + list_for_each_entry(ir, &svm->ir_list, node) { + if (activate) + ret = amd_iommu_activate_guest_mode(ir->data); + else + ret = amd_iommu_deactivate_guest_mode(ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; } -/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; + bool activated = kvm_vcpu_apicv_active(vcpu); - if (kvm_vcpu_apicv_active(vcpu)) + if (activated) { + /** + * During AVIC temporary deactivation, guest could update + * APIC ID, DFR and LDR registers, which would not be trapped + * by avic_unaccelerated_access_interception(). In this case, + * we need to check and update the AVIC logical APIC ID table + * accordingly before re-activating. + */ + avic_post_state_restore(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - else + } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + } mark_dirty(vmcb, VMCB_AVIC); + + svm_set_pi_irte_mode(vcpu, activated); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -5445,9 +5534,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (kvm_vcpu_apicv_active(vcpu)) - return; - /* * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes * 1, because that's a separate STGI/VMRUN intercept. The next time we @@ -5457,6 +5543,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * window under the assumption that the hardware will set the GIF. */ if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { + /* + * IRQ window is not needed when AVIC is enabled, + * unless we have pending ExtINT since it cannot be injected + * via AVIC. In such case, we need to temporarily disable AVIC, + * and fallback to injecting IRQ via V_IRQ. 
+ */ + svm_toggle_avic_for_irq_window(vcpu, false); svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -5929,6 +6022,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) return; guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); + + /* + * Currently, AVIC does not work with nested virtualization. + * So, we disable AVIC when cpuid for SVM is set in the L1 guest. + */ + if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_NESTED); } #define F feature_bit @@ -7257,6 +7358,22 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); } +static bool svm_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_NESTED) | + BIT(APICV_INHIBIT_REASON_IRQWIN) | + BIT(APICV_INHIBIT_REASON_PIT_REINJ); + + return supported & BIT(bit); +} + +static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) +{ + avic_update_access_page(kvm, activate); +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7274,7 +7391,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vm_alloc = svm_vm_alloc, .vm_free = svm_vm_free, - .vm_init = avic_vm_init, + .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, .prepare_guest_switch = svm_prepare_guest_switch, @@ -7331,8 +7448,9 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_apic_mode = svm_set_virtual_apic_mode, - .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, + .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c741a0c5f80..f194dd058470 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1291,6 +1291,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +TRACE_EVENT(kvm_apicv_update_request, + TP_PROTO(bool activate, unsigned long bit), + TP_ARGS(activate, bit), + + TP_STRUCT__entry( + __field(bool, activate) + __field(unsigned long, bit) + ), + + TP_fast_assign( + __entry->activate = activate; + __entry->bit = bit; + ), + + TP_printk("%s bit=%lu", + __entry->activate ? "activate" : "deactivate", + __entry->bit) +); + /* * Tracepoint for AMD AVIC */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 89c3e0caf39f..303813423c3e 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -7,6 +7,7 @@ #include "evmcs.h" #include "vmcs.h" #include "vmx.h" +#include "trace.h" DEFINE_STATIC_KEY_FALSE(enable_evmcs); @@ -346,6 +347,84 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) return 0; } +void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) +{ + u32 ctl_low = (u32)*pdata; + u32 ctl_high = (u32)(*pdata >> 32); + + /* + * Hyper-V 2016 and 2019 try using these features even when eVMCS + * is enabled but there are no corresponding fields. 
+ */ + switch (msr_index) { + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + break; + case MSR_IA32_VMX_ENTRY_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + break; + } + + *pdata = ctl_low | ((u64)ctl_high << 32); +} + +int nested_evmcs_check_controls(struct vmcs12 *vmcs12) +{ + int ret = 0; + u32 unsupp_ctl; + + unsupp_ctl = vmcs12->pin_based_vm_exec_control & + EVMCS1_UNSUPPORTED_PINCTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported pin-based VM-execution controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->secondary_vm_exec_control & + EVMCS1_UNSUPPORTED_2NDEXEC; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported secondary VM-execution controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_exit_controls & + EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-exit controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_entry_controls & + EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-entry controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-function controls", + unsupp_ctl); + ret = -EINVAL; + } + + return ret; +} + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { @@ -356,11 +435,5 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; - return 0; } diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 07ebf6882a45..6de47f2569c9 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -10,6 +10,7 @@ #include "capabilities.h" #include "vmcs.h" +#include "vmcs12.h" struct vmcs_config; @@ -201,5 +202,7 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); +void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata); +int nested_evmcs_check_controls(struct vmcs12 *vmcs12); #endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2db21d59eaf5..657c2eda357c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1074,10 +1074,10 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* - * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are - * emulating VM entry into a guest with EPT enabled. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. + * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are + * emulating VM-Entry into a guest with EPT enabled. 
On failure, the expected + * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to + * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, u32 *entry_failure_code) @@ -2757,6 +2757,9 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; + if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) + return nested_evmcs_check_controls(vmcs12); + return 0; } @@ -4723,8 +4726,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - /* Emulate the VMLAUNCH instruction */ static int handle_vmlaunch(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 34a3a17bb6d7..fd21cdb10b79 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -260,13 +260,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { - if (msr_info->host_initiated) - pmc->counter = data; - else - pmc->counter = (s32)data; + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter = data; + pmc->counter += data - pmc_read_counter(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c475fa2aaae0..9a6664886f2e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1428,8 +1428,6 @@ static bool emulation_required(struct kvm_vcpu *vcpu) return emulate_invalid_guest_state && !guest_state_valid(vcpu); } -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); - unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1853,8 +1851,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, - &msr_info->data); + if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data)) + return 1; + /* + * Enlightened VMCS v1 doesn't have certain fields, but buggy + * Hyper-V versions are still trying to use corresponding + * features when they are exposed. Filter out the essential + * minimum. 
+ */ + if (!msr_info->host_initiated && + vmx->nested.enlightened_vmcs_enabled) + nested_evmcs_filter_control_msr(msr_info->index, + &msr_info->data); + break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -3719,11 +3729,6 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) } } -static bool vmx_get_enable_apicv(struct kvm *kvm) -{ - return enable_apicv; -} - static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6813,6 +6818,7 @@ static int vmx_vm_init(struct kvm *kvm) break; } } + kvm_apicv_init(kvm, enable_apicv); return 0; } @@ -7714,6 +7720,14 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } +static bool vmx_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV); + + return supported & BIT(bit); +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -7786,10 +7800,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, + .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d3be7f3ad67..fbabb2f06273 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -26,6 +26,7 @@ #include "cpuid.h" #include "pmu.h" #include "hyperv.h" +#include "lapic.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -897,6 +898,8 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); __reserved_bits |= X86_CR4_PKE; \ if (!__cpu_has(__c, X86_FEATURE_LA57)) \ __reserved_bits |= X86_CR4_LA57; \ + if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ + __reserved_bits |= X86_CR4_UMIP; \ __reserved_bits; \ }) @@ -1609,6 +1612,8 @@ struct pvclock_clock { u64 mask; u32 mult; u32 shift; + u64 base_cycles; + u64 offset; }; struct pvclock_gtod_data { @@ -1617,11 +1622,8 @@ struct pvclock_gtod_data { struct pvclock_clock clock; /* extract of a clocksource struct */ struct pvclock_clock raw_clock; /* extract of a clocksource struct */ - u64 boot_ns_raw; - u64 boot_ns; - u64 nsec_base; + ktime_t offs_boot; u64 wall_time_sec; - u64 monotonic_raw_nsec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1629,10 +1631,6 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns, boot_ns_raw; - - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); - boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); @@ -1642,23 +1640,35 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; + vdata->clock.offset = tk->tkr_mono.base; vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; vdata->raw_clock.cycle_last = 
tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; vdata->raw_clock.shift = tk->tkr_raw.shift; - - vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; + vdata->raw_clock.offset = tk->tkr_raw.base; vdata->wall_time_sec = tk->xtime_sec; - vdata->boot_ns_raw = boot_ns_raw; - vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; + vdata->offs_boot = tk->offs_boot; write_seqcount_end(&vdata->seq); } + +static s64 get_kvmclock_base_ns(void) +{ + /* Count up from boot time, but with the frequency of the raw clock. */ + return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); +} +#else +static s64 get_kvmclock_base_ns(void) +{ + /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ + return ktime_get_boottime_ns(); +} #endif void kvm_set_pending_timer(struct kvm_vcpu *vcpu) @@ -1672,7 +1682,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec64 boot; + u64 wall_nsec; if (!wall_clock) return; @@ -1692,17 +1702,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. + * wall clock specified here. We do the reverse here. */ - getboottime64(&boot); + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); - if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; + wc.nsec = do_div(wall_nsec, 1000000000); + wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -1950,7 +1955,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boottime_ns(); + ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -2125,10 +2130,10 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); - ns = gtod->monotonic_raw_nsec; + ns = gtod->raw_clock.base_cycles; ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); - ns >>= gtod->clock.shift; - ns += gtod->boot_ns_raw; + ns >>= gtod->raw_clock.shift; + ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -2145,7 +2150,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; - ns = gtod->nsec_base; + ns = gtod->clock.base_cycles; ns += vgettsc(>od->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -2288,7 +2293,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boottime_ns() + ka->kvmclock_offset; + return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2304,7 +2309,7 @@ u64 
get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boottime_ns() + ka->kvmclock_offset; + ret = get_kvmclock_base_ns() + ka->kvmclock_offset; put_cpu(); @@ -2403,7 +2408,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boottime_ns(); + kernel_ns = get_kvmclock_base_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2443,6 +2448,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; + WARN_ON(vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -7456,18 +7462,22 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } -void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) +bool kvm_apicv_activated(struct kvm *kvm) { - if (!lapic_in_kernel(vcpu)) { - WARN_ON_ONCE(vcpu->arch.apicv_active); - return; - } - if (!vcpu->arch.apicv_active) - return; + return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); +} +EXPORT_SYMBOL_GPL(kvm_apicv_activated); - vcpu->arch.apicv_active = false; - kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); +void kvm_apicv_init(struct kvm *kvm, bool enable) +{ + if (enable) + clear_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); + else + set_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); } +EXPORT_SYMBOL_GPL(kvm_apicv_init); static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) { @@ -7996,6 +8006,47 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } +void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return; + + vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + kvm_apic_update_apicv(vcpu); + kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); + +/* + * NOTE: Do not hold any lock prior to calling this. + * + * In particular, kvm_request_apicv_update() expects kvm->srcu not to be + * locked, because it calls __x86_set_memory_region() which does + * synchronize_srcu(&kvm->srcu). 
+ */ +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + if (!kvm_x86_ops->check_apicv_inhibit_reasons || + !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) + return; + + if (activate) { + if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || + !kvm_apicv_activated(kvm)) + return; + } else { + if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || + kvm_apicv_activated(kvm)) + return; + } + + trace_kvm_apicv_update_request(activate, bit); + if (kvm_x86_ops->pre_update_apicv_exec_ctrl) + kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); +} +EXPORT_SYMBOL_GPL(kvm_request_apicv_update); + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { if (!kvm_apic_present(vcpu)) @@ -8186,6 +8237,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -9219,10 +9272,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return r; if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; + if (kvm_apicv_activated(vcpu->kvm)) + vcpu->arch.apicv_active = true; } else static_key_slow_inc(&kvm_no_apic_vcpu); @@ -9633,7 +9687,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); + kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; @@ -10448,3 +10502,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2d2ff855773b..3624665acee4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -357,7 +357,7 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -static inline bool kvm_dr7_valid(unsigned long data) +static inline bool kvm_dr7_valid(u64 data) { /* Bits [63:32] are reserved */ return !(data >> 32); |