From bdaffe1d93e7eddbcc71d074a5d49eba7fe1c765 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 15:03:06 +0200 Subject: KVM: x86: set TMR when the interrupt is accepted Do not compute TMR in advance. Instead, set the TMR just before the interrupt is accepted into the IRR. This limits the coupling between IOAPIC and LAPIC. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 9 ++------- arch/x86/kvm/ioapic.h | 3 +-- arch/x86/kvm/lapic.c | 19 ++++++++++--------- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/x86.c | 5 +---- 5 files changed, 14 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..eaf4ec38d980 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,8 +246,7 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +259,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..3dbd0e2aac4e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,7 +120,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..5693dd9fc163 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -551,15 +551,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -781,6 +772,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,6 +784,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..eb46d6bcaa75 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92511d4b7236..c1ed74ebc502 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6144,17 +6144,14 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; - u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 3bb345f387dd26beb097cf776e342bc0d96d805a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 10:43:18 +0200 Subject: KVM: x86: store IOAPIC-handled vectors in each VCPU We can reuse the algorithm that computes the EOI exit bitmap to figure out which vectors are handled by the IOAPIC. The only difference between the two is for edge-triggered interrupts other than IRQ8 that have no notifiers active; however, the IOAPIC does not have to do anything special for these interrupts anyway. This again limits the interactions between the IOAPIC and the LAPIC, making it easier to move the former to userspace. Inspired by a patch from Steve Rutherford. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/ioapic.c | 18 ++---------------- arch/x86/kvm/ioapic.h | 8 -------- arch/x86/kvm/lapic.c | 10 ++++++++-- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 8 +++----- 7 files changed, 18 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..33609c2c743b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -822,7 +823,7 @@ struct kvm_x86_ops { int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index eaf4ec38d980..2dcda0f188ba 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,19 +233,6 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; @@ -310,7 +297,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -594,7 +580,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -623,8 +608,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -661,7 +648,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3dbd0e2aac4e..bf36d66a1951 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -73,7 +73,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,13 +97,6 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); -} - void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5693dd9fc163..4c30fb0a48a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -869,14 +869,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } @@ -1923,7 +1929,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f9ed1ff0632..09582081276e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3664,7 +3664,7 @@ static int svm_vm_has_apicv(struct kvm *kvm) return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06ef4908ba61..9381b1d34313 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8043,8 +8043,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; if (!vmx_vm_has_apicv(vcpu->kvm)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1ed74ebc502..c4adb8847b23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6143,15 +6143,13 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From d50ab6c1a2b24e12d3012d7beb343eba5b94a6ca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 11:49:59 +0200 Subject: KVM: x86: replace vm_has_apicv hook with cpu_uses_apicv This will avoid an unnecessary trip to ->kvm and from there to the VPIC. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 8 +++++++- 6 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33609c2c743b..a0ef289d5a86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -820,7 +820,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..c0dad893dc59 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -63,7 +63,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4c30fb0a48a1..c568d69c7060 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -390,7 +390,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -1622,7 +1622,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index eb46d6bcaa75..7259d272416f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09582081276e..9a37e487aca0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3659,7 +3659,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } @@ -4425,7 +4425,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9381b1d34313..b8a90c4f676f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -810,6 +810,7 @@ static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -4337,6 +4338,11 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +{ + return vmx_vm_has_apicv(vcpu->kvm); +} + static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10362,7 +10368,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, -- cgit v1.2.3-70-g09d2 From 35754c987f252e859bfa390a6816e85563afe79d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 12:05:37 +0200 Subject: KVM: x86: introduce lapic_in_kernel Avoid pointer chasing and memory barriers, and simplify the code when split irqchip (LAPIC in kernel, IOAPIC/PIC in userspace) is introduced. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 6 +++--- arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 46 ++++++++++++++++++++-------------------------- arch/x86/kvm/x86.c | 18 +++++++++--------- 7 files changed, 45 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c0dad893dc59..b653ae202c8e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..9e6e7e04de98 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -92,6 +92,14 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return vpic != NULL; } +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; +} + void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c568d69c7060..c4bcc86d6dc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1985,7 +1985,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -2002,7 +2002,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..c3f39aa9b9cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3427,7 +3427,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a37e487aca0..34c089023827 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3060,7 +3060,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3305,7 +3305,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && + if (!lapic_in_kernel(&svm->vcpu) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a90c4f676f..4729b7f6e5f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -809,7 +809,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -947,9 +946,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -1063,9 +1062,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -2378,7 +2377,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -4333,14 +4332,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv && irqchip_in_kernel(kvm); -} - static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return vmx_vm_has_apicv(vcpu->kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4520,7 +4514,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4532,7 +4526,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4549,7 +4543,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -4563,7 +4557,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4624,7 +4618,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4768,7 +4762,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4776,7 +4770,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -5316,7 +5310,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5535,7 +5529,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(vcpu->kvm) && + if (!lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; @@ -7944,10 +7938,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8052,7 +8046,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; - if (!vmx_vm_has_apicv(vcpu->kvm)) + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8551,7 +8545,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -9344,7 +9338,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4adb8847b23..88fb3bf2ae6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -788,7 +788,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -3175,7 +3175,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -5666,7 +5666,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -6162,7 +6162,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6200,7 +6200,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + bool req_int_win = !lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window; bool req_immediate_exit = false; @@ -6597,7 +6597,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -7297,7 +7297,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7391,7 +7391,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } -- cgit v1.2.3-70-g09d2 From 4ca7dd8ce4b24e18f94eed90e80c6eb80fb48c9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Jul 2015 10:32:16 +0200 Subject: KVM: x86: unify handling of interrupt window The interrupt window is currently checked twice, once in vmx.c/svm.c and once in dm_request_for_irq_injection. The only difference is the extra check for kvm_arch_interrupt_allowed in dm_request_for_irq_injection, and the different return value (EINTR/KVM_EXIT_INTR for vmx.c/svm.c vs. 0/KVM_EXIT_IRQ_WINDOW_OPEN for dm_request_for_irq_injection). However, dm_request_for_irq_injection is basically dead code! Revive it by removing the checks in vmx.c and svm.c's vmexit handlers, and fixing the returned values for the dm_request_for_irq_injection case. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 13 ------------- arch/x86/kvm/vmx.c | 11 ----------- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 2 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 34c089023827..54a86183e5d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3294,24 +3294,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(&svm->vcpu) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4729b7f6e5f2..6ee0dc69675b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5524,17 +5524,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(vcpu) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88fb3bf2ae6d..28c98c8f9d1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6469,8 +6469,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } -- cgit v1.2.3-70-g09d2 From 49df6397edfc5a8ba8ca813b51fb9729d8e94b40 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:40 -0700 Subject: KVM: x86: Split the APIC from the rest of IRQCHIP. First patch in a series which enables the relocation of the PIC/IOAPIC to userspace. Adds capability KVM_CAP_SPLIT_IRQCHIP; KVM_CAP_SPLIT_IRQCHIP enables the construction of LAPICs without the rest of the irqchip. Compile tested for x86. Signed-off-by: Steve Rutherford Suggested-by: Andrew Honig Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/i8254.c | 4 +++- arch/x86/kvm/ioapic.h | 8 ++++++++ arch/x86/kvm/irq.h | 11 ++++++++++- arch/x86/kvm/irq_comm.c | 9 ++++++++- arch/x86/kvm/lapic.c | 6 ++++-- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++-- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 10 files changed, 75 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d9ecceea5a02..43e0816d0de1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3627,6 +3627,23 @@ struct { KVM handlers should exit to userspace with rc = -EREMOTE. +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: None +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel +IOAPIC or PIC. This also enables in kernel routing of interrupt requests. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0ef289d5a86..befcf555bddc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -684,6 +684,8 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..08116ff227cc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include #include +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bf36d66a1951..a8842c0dee73 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -97,6 +97,14 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +static inline int ioapic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; +} + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9e6e7e04de98..2f9703dcd913 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,22 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return vpic != NULL; + return ret; } static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..67f6b62a6814 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,7 +208,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4bcc86d6dc4..e05946c36b87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,7 +209,8 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + if (ioapic_in_kernel(kvm)) + kvm_vcpu_request_scan_ioapic(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -1845,7 +1846,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28c98c8f9d1c..f720774a4797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2448,6 +2448,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3555,6 +3556,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. */ + smp_wmb(); + kvm->arch.irqchip_split = true; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3668,7 +3687,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3692,7 +3711,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..354f147647ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1002,6 +1002,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #endif int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9256f0331ae..ed00f8fc9ea2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -824,6 +824,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-70-g09d2 From 7543a635aa09eb138b2cbf60ac3ff19503ae6954 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:41 -0700 Subject: KVM: x86: Add KVM exit for IOAPIC EOIs Adds KVM_EXIT_IOAPIC_EOI which allows the kernel to EOI level-triggered IOAPIC interrupts. Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs to be informed (which is identical to the EOI_EXIT_BITMAP field used by modern x86 processors, but can also be used to elide kvm IOAPIC EOI exits on older processors). [Note: A prototype using ResampleFDs found that decoupling the EOI from the VCPU's thread made it possible for the VCPU to not see a recent EOI after reentering the guest. This does not match real hardware.] Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 12 ++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 24 +++++++++++++++++------- arch/x86/kvm/x86.c | 11 +++++++++++ include/linux/kvm_host.h | 2 +- include/uapi/linux/kvm.h | 5 +++++ 6 files changed, 48 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 43e0816d0de1..0d14bf5db534 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3309,6 +3309,18 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + /* Fix the size of the union. */ char padding[256]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index befcf555bddc..af09fa1d1be7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -574,6 +574,8 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e05946c36b87..ef70f6f3a37a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -877,15 +877,25 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f720774a4797..27429daa054e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6271,6 +6271,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 354f147647ab..3b33215ed447 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -140,6 +140,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 +#define KVM_REQ_IOAPIC_EOI_EXIT 28 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -1146,4 +1147,3 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif - diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ed00f8fc9ea2..12e3afbf0f47 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -183,6 +183,7 @@ struct kvm_s390_skeys { #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -333,6 +334,10 @@ struct kvm_run { __u8 sel1; __u16 sel2; } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3-70-g09d2 From b053b2aef25d00773fa6762dcd4b7f5c9c42d171 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:32:35 -0700 Subject: KVM: x86: Add EOI exit bitmap inference In order to support a userspace IOAPIC interacting with an in kernel APIC, the EOI exit bitmaps need to be configurable. If the IOAPIC is in userspace (i.e. the irqchip has been split), the EOI exit bitmaps will be set whenever the GSI Routes are configured. In particular, for the low MSI routes are reservable for userspace IOAPICs. For these MSI routes, the EOI Exit bit corresponding to the destination vector of the route will be set for the destination VCPU. The intention is for the userspace IOAPICs to use the reservable MSI routes to inject interrupts into the guest. This is a slight abuse of the notion of an MSI Route, given that MSIs classically bypass the IOAPIC. It might be worthwhile to add an additional route type to improve clarity. Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 ++++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/ioapic.h | 2 ++ arch/x86/kvm/irq_comm.c | 42 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/x86.c | 9 ++++++++- include/linux/kvm_host.h | 16 +++++++++++++++ virt/kvm/irqchip.c | 12 ++--------- 8 files changed, 78 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0d14bf5db534..89e71648d748 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3642,7 +3642,7 @@ KVM handlers should exit to userspace with rc = -EREMOTE. 7.5 KVM_CAP_SPLIT_IRQCHIP Architectures: x86 -Parameters: None +Parameters: args[0] - number of routes reserved for userspace IOAPICs Returns: 0 on success, -1 on error Create a local apic for each processor in the kernel. This can be used @@ -3650,8 +3650,11 @@ instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the IOAPIC and PIC (and also the PIT, even though this has to be enabled separately). -This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel -IOAPIC or PIC. This also enables in kernel routing of interrupt requests. +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. Fails if VCPU has already been created, or if the irqchip is already in the kernel (i.e. KVM_CREATE_IRQCHIP has already been called). diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af09fa1d1be7..7a5f9debbcd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,7 @@ struct kvm_arch { u64 disabled_quirks; bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index a8842c0dee73..084617d37c74 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -121,5 +122,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 67f6b62a6814..177460998bb0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -335,3 +335,45 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) { return kvm_set_irq_routing(kvm, empty_routing, 0, 0); } + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, + dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ef70f6f3a37a..2f4c0d0cbe0a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,8 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - if (ioapic_in_kernel(kvm)) - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27429daa054e..4aeed2086c5e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,6 +3558,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; @@ -3569,6 +3572,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -6167,7 +6171,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b33215ed447..d754f2d826e9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,18 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -456,10 +468,14 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_irq_routing_update(struct kvm *kvm); #else static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { } +static inline void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +} #endif #ifdef CONFIG_HAVE_KVM_IRQFD diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index d7ea8e20dae4..716a1c4db528 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,16 +31,6 @@ #include #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { @@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); + kvm_arch_irq_routing_update(kvm); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; -- cgit v1.2.3-70-g09d2 From 1c1a9ce973a7863dd46767226bce2a5f12d48bc6 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Thu, 30 Jul 2015 11:27:16 +0200 Subject: KVM: x86: Add support for local interrupt requests from userspace In order to enable userspace PIC support, the userspace PIC needs to be able to inject local interrupts even when the APICs are in the kernel. KVM_INTERRUPT now supports sending local interrupts to an APIC when APICs are in the kernel. The ready_for_interrupt_request flag is now only set when the CPU/APIC will immediately accept and inject an interrupt (i.e. APIC has not masked the PIC). When the PIC wishes to initiate an INTA cycle with, say, CPU0, it kicks CPU0 out of the guest, and renedezvous with CPU0 once it arrives in userspace. When the CPU/APIC unmasks the PIC, a KVM_EXIT_IRQ_WINDOW_OPEN is triggered, so that userspace has a chance to inject a PIC interrupt if it had been pending. Overall, this design can lead to a small number of spurious userspace renedezvous. In particular, whenever the PIC transistions from low to high while it is masked and whenever the PIC becomes unmasked while it is low. Note: this does not buffer more than one local interrupt in the kernel, so the VMM needs to enter the guest in order to complete interrupt injection before injecting an additional interrupt. Compiles for x86. Can pass the KVM Unit Tests. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 14 +++++++++---- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 32 +++++++++++++++++++++++------ arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++++++++--------- 5 files changed, 78 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 89e71648d748..e3e9c41721a2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -401,10 +401,9 @@ Capability: basic Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error +Returns: 0 on success, negative on failure. -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. +Queues a hardware interrupt vector to be injected. /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -414,7 +413,14 @@ struct kvm_interrupt { X86: -Note 'irq' is an interrupt vector, not an interrupt pin or line. +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL the the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. PPC: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7a5f9debbcd8..76a5b30979b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_arch { } pv; int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b653ae202c8e..097060e33bd6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); +/* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + /* * check if there is pending interrupt from * non-APIC source without intack. */ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2f9703dcd913..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,14 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + static inline int irqchip_split(struct kvm *kvm) { return kvm->arch.irqchip_split; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aeed2086c5e..8b97c54edebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2662,12 +2662,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; return 0; } @@ -5796,9 +5808,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) + return false; + + if (kvm_cpu_has_interrupt(vcpu)) + return false; + + return (irqchip_split(vcpu->kvm) + ? kvm_apic_accept_pic_intr(vcpu) + : kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5809,13 +5827,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else + if (!irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = kvm_arch_interrupt_allowed(vcpu) && !kvm_cpu_has_interrupt(vcpu) && !kvm_event_needs_reinjection(vcpu); + else if (!pic_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = + kvm_apic_accept_pic_intr(vcpu) && + !kvm_cpu_has_interrupt(vcpu); + else + kvm_run->ready_for_interrupt_injection = 1; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -7403,6 +7425,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; fail_free_mce_banks: -- cgit v1.2.3-70-g09d2 From 931c33b178b091cced2a6b3f57f04655f8ff5207 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Sep 2015 14:41:58 +0800 Subject: kvm: add tracepoint for fast mmio Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 18 ++++++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..ce4abe333c39 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio, __entry->count > 1 ? "(...)" : "") ); +/* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + /* * Tracepoint for cpuid. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee0dc69675b..324b09ff1def 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5756,6 +5756,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b97c54edebc..b90ad01c617d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8070,6 +8070,7 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -- cgit v1.2.3-70-g09d2 From e516cebb4f2b2057a5a421fea589079502acfff6 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:48 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_RESET msr HV_X64_MSR_RESET msr is used by Hyper-V based Windows guest to reset guest VM by hypervisor. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 10 ++++++++++ arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 1 + 4 files changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..dab584bf7ddf 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,9 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..0ad11a232474 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -241,6 +248,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90ad01c617d..297b36fa3b80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -952,6 +952,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -6321,6 +6322,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d754f2d826e9..cd0ba2e931e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -141,6 +141,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 #define KVM_REQ_IOAPIC_EOI_EXIT 28 +#define KVM_REQ_HV_RESET 29 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v1.2.3-70-g09d2 From 11c4b1ca719eaaa5ca6fe0e80bb009f3f2012fd2 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:49 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_INDEX export for QEMU. Insert Hyper-V HV_X64_MSR_VP_INDEX into msr's emulated list, so QEMU can set Hyper-V features cpuid HV_X64_MSR_VP_INDEX_AVAILABLE bit correctly. KVM emulation part is in place already. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 297b36fa3b80..716b4610791c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -953,6 +953,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, -- cgit v1.2.3-70-g09d2 From 9eec50b8bbe1535c440a1ee88c1958f78fc55957 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:50 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_RUNTIME support HV_X64_MSR_VP_RUNTIME msr used by guest to get "the time the virtual processor consumes running guest code, and the time the associated logical processor spends running hypervisor code on behalf of that guest." Calculation of this time is performed by task_cputime_adjusted() for vcpu task. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 21 +++++++++++++++++++-- arch/x86/kvm/x86.c | 1 + kernel/sched/cputime.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76a5b30979b3..d064cb2e19e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -374,6 +374,7 @@ struct kvm_mtrr { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; }; struct kvm_vcpu_arch { diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index dab584bf7ddf..2677a0aac2cc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -156,6 +156,9 @@ /* MSR used to reset the guest OS. */ #define HV_X64_MSR_RESET 0x40000003 +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0ad11a232474..62cf8c915e95 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -178,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -212,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -287,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -305,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716b4610791c..185fc1619c99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -954,6 +954,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { -- cgit v1.2.3-70-g09d2 From d6a858d13e5d02b97daab5ba0648c704ae3f9517 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2015 11:58:14 +0200 Subject: KVM: vmx: disable posted interrupts if no local APIC Uniprocessor 32-bit randconfigs can disable the local APIC, and posted interrupts require reserving a vector on the LAPIC, so they are incompatible. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 324b09ff1def..0f15e2382109 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -983,7 +983,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) -- cgit v1.2.3-70-g09d2 From eb1c31b46800a476644cd63ab3bc7ef918b0a917 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:50 +0800 Subject: KVM: x86: allow guest to use cflushopt and clwb Pass these CPU features to guest to enable them in guest They are needed by nvdimm drivers Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..962fc7d7e0d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = -- cgit v1.2.3-70-g09d2 From 8b3e34e46aca9b6d349b331cd9cf71ccbdc91b2e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:51 +0800 Subject: KVM: x86: add pcommit support Pass PCOMMIT CPU feature to guest to enable PCOMMIT instruction Currently we do not catch pcommit instruction for L1 guest and allow L1 to catch this instruction for L2 if, as required by the spec, L1 can enumerate the PCOMMIT instruction via CPUID: | IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the | 1-setting of PCOMMIT exiting) is always the same as | CPUID.07H:EBX.PCOMMIT[bit 22]. Thus, software can set PCOMMIT exiting | to 1 if and only if the PCOMMIT instruction is enumerated via CPUID The spec can be found at https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++++++++++----- 5 files changed, 42 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..d25f32ac9c5c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 962fc7d7e0d4..faeb0b3bb343 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..aed7bfeeaf0d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -133,4 +133,12 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f15e2382109..691010604144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2475,7 +2475,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3016,7 +3017,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4571,6 +4573,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) /* PML is enabled/disabled in creating/destorying vcpu */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; + return exec_control; } @@ -4614,10 +4619,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -7212,6 +7216,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7262,6 +7273,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7563,6 +7575,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -8697,6 +8711,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9310,7 +9333,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; -- cgit v1.2.3-70-g09d2 From e2821620c0908593e6cb40f79c7e78b31921ed73 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:52 +0800 Subject: KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02() SECONDARY_EXEC_RDTSCP set for L2 guest comes from vmcs12 Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 691010604144..8f4c13808232 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9327,8 +9327,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | -- cgit v1.2.3-70-g09d2 From f36201e5f44b55faf44ddc820929548e51ec841b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:53 +0800 Subject: KVM: VMX: simplify rdtscp handling in vmx_cpuid_update() if vmx_rdtscp_supported() is true SECONDARY_EXEC_RDTSCP must have already been set in current vmcs by vmx_secondary_exec_control() Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f4c13808232..d53f57aad5eb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8677,16 +8677,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); } + if (nested && !vmx->rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; -- cgit v1.2.3-70-g09d2 From 29541bb8f4a18b2939dea137315e1803b4617c8d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:54 +0800 Subject: KVM: VMX: simplify invpcid handling in vmx_cpuid_update() If vmx_invpcid_supported() is true, second execution control filed must be supported and SECONDARY_EXEC_ENABLE_INVPCID must have already been set in current vmcs by vmx_secondary_exec_control() If vmx_invpcid_supported() is false, no need to clear SECONDARY_EXEC_ENABLE_INVPCID Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d53f57aad5eb..a5b26a940df2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8694,19 +8694,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } -- cgit v1.2.3-70-g09d2 From 8b97265a1516819d54c2d24b3874489a52f269d4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Sep 2015 17:34:42 +0200 Subject: KVM: VMX: align vmx->nested.nested_vmx_secondary_ctls_high to vmx->rdtscp_enabled The SECONDARY_EXEC_RDTSCP must be available iff RDTSCP is enabled in the guest. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5b26a940df2..dad471f8ca57 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8686,9 +8686,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; + if (nested) { + if (vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } } /* Exposing INVPCID only when PCID is exposed */ -- cgit v1.2.3-70-g09d2 From feda805fe7c4ed9cf78158e73b1218752e3b4314 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:55 +0800 Subject: KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dad471f8ca57..a7a0ed6906fe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8668,23 +8668,38 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + else + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { if (vmx->rdtscp_enabled) @@ -8701,14 +8716,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (vmx_invpcid_supported() && (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || !guest_cpuid_has_pcid(vcpu))) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + vmcs_set_secondary_exec_control(secondary_exec_ctl); + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { if (guest_cpuid_has_pcommit(vcpu)) vmx->nested.nested_vmx_secondary_ctls_high |= -- cgit v1.2.3-70-g09d2 From 7ec362964d6cc228ea68572fdd6d75a59f8b40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:56 +0800 Subject: KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL Use vmcs_set_bits() and vmcs_clear_bits() to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7a0ed6906fe..2937cf136df6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6636,7 +6636,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6649,9 +6648,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -7047,7 +7045,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7079,9 +7076,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7591,7 +7587,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) static int vmx_enable_pml(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7602,24 +7597,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); return 0; } static void vmx_disable_pml(struct vcpu_vmx *vmx) { - u32 exec_control; - ASSERT(vmx->pml_pg); __free_page(vmx->pml_pg); vmx->pml_pg = NULL; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 1cea0ce68ed76490ffa64a9e2a7a40104efe9352 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:57 +0800 Subject: KVM: VMX: drop rdtscp_enabled field Check cpuid bit instead of it Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 17 ++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index aed7bfeeaf0d..d434ee952b00 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -141,4 +141,12 @@ static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); } + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2937cf136df6..6f3f97f6e248 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -532,8 +532,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -2208,7 +2206,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2675,7 +2673,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Otherwise falls through */ default: @@ -2781,7 +2779,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -8682,16 +8680,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { - if (vmx->rdtscp_enabled) + if (rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else -- cgit v1.2.3-70-g09d2 From 72c930dcfc2b49404ee9e20f6c868402e9c71166 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 18 Sep 2015 17:54:29 +0200 Subject: x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flags was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.3-70-g09d2 From 18cd52c4d9bfbd081fb643021ba8d048475cd82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Sep 2015 18:04:17 +0200 Subject: irq_remapping: move structs outside #ifdef This is friendlier to clients of the code, who are going to prepare vcpu_data structs unconditionally, even if CONFIG_IRQ_REMAP is not defined. Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/irq_remapping.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } -- cgit v1.2.3-70-g09d2 From 6ef1522f7ecc063317dfb5ca63da6e47130a4c50 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:45 +0800 Subject: KVM: Extend struct pi_desc for VT-d Posted-Interrupts Extend struct pi_desc for VT-d Posted-Interrupts. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f3f97f6e248..4ebdccd882c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -446,8 +446,24 @@ struct nested_vmx { /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) -- cgit v1.2.3-70-g09d2 From ebbfc765369690cf8fc512615e6b83ec1702f8ac Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:46 +0800 Subject: KVM: Add some helper functions for Posted-Interrupts This patch adds some helper functions to manipulate the Posted-Interrupts Descriptor. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson [Make the new functions inline. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ebdccd882c8..4739a52874c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -443,6 +443,8 @@ struct nested_vmx { }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ @@ -483,6 +485,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; -- cgit v1.2.3-70-g09d2 From 8feb4a04dc756002f78df0026e58118669de4851 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:47 +0800 Subject: KVM: Define a new interface kvm_intr_is_single_vcpu() This patch defines a new interface kvm_intr_is_single_vcpu(), which can returns whether the interrupt is for single-CPU or not. It is used by VT-d PI, since now we only support single-CPU interrupts, For lowest-priority interrupts, if user configures it via /proc/irq or uses irqbalance to make it single-CPU, we can use PI to deliver the interrupts to it. Full functionality of lowest-priority support will be added later. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/irq_comm.c | 27 +++++++++++++++++++ arch/x86/kvm/lapic.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 2 ++ 4 files changed, 91 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d064cb2e19e8..ba4e5673e604 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1241,4 +1241,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 177460998bb0..39f833f8132e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,6 +297,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f4c0d0cbe0a..944b38a56929 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -755,6 +755,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7259d272416f..fde8e35d5850 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -168,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif -- cgit v1.2.3-70-g09d2 From d84f1e0755ba1e87d20d1f90c1e6eb0cbbc0af9d Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:49 +0800 Subject: KVM: make kvm_set_msi_irq() public Make kvm_set_msi_irq() public, we can use this function outside. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/irq_comm.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ba4e5673e604..79fffbbe2348 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -176,6 +176,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -1244,4 +1246,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39f833f8132e..c89228980230 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) -- cgit v1.2.3-70-g09d2 From efc644048ecde54f016011fe10110addd0de348f Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:51 +0800 Subject: KVM: x86: Update IRTE for posted-interrupts This patch adds the routine to update IRTE for posted-interrupts when guest changes the interrupt configuration. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Fengguang Wu [Squashed in automatically generated patch from the build robot "KVM: x86: vcpu_to_pi_desc() can be static" - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/trace.h | 33 ++++++++++++++++ arch/x86/kvm/vmx.c | 83 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 + 4 files changed, 121 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79fffbbe2348..a4067ecd4376 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -897,6 +897,9 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ce4abe333c39..120302511802 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -992,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? "enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4739a52874c2..cb26b93a2cd7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -603,6 +604,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -10345,6 +10351,81 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10462,6 +10543,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185fc1619c99..3978ace4ddbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -64,6 +64,7 @@ #include /* Ugh! */ #include #include +#include #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -8094,3 +8095,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); -- cgit v1.2.3-70-g09d2 From 87276880065246ce49ec571130d3d1e4a22e5604 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:40 +0800 Subject: KVM: x86: select IRQ_BYPASS_MANAGER Select IRQ_BYPASS_MANAGER for x86 when CONFIG_KVM is set Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/Kconfig | 2 ++ arch/x86/kvm/x86.c | 53 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4067ecd4376..15664994b6f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3978ace4ddbc..b8425a769c0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -8079,6 +8081,57 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. + */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -- cgit v1.2.3-70-g09d2 From 28b835d60fcc2498e717cf5e6f0c3691c24546f7 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:54 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is preempted This patch updates the Posted-Interrupts Descriptor when vCPU is preempted. sched out: - Set 'SN' to suppress furture non-urgent interrupts posted for the vCPU. sched in: - Clear 'SN' - Change NDST if vCPU is scheduled to a different CPU - Set 'NV' to POSTED_INTR_VECTOR Signed-off-by: Feng Wu [Include asm/cpu.h to fix !CONFIG_SMP compilation. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb26b93a2cd7..99f5c61954ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1942,6 +1943,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -1992,10 +2039,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -4427,6 +4491,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; -- cgit v1.2.3-70-g09d2 From bf9f6ac8d74969690df1485b33b7c238ca9f2269 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:55 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is blocked This patch updates the Posted-Interrupts Descriptor when vCPU is blocked. pre-block: - Add the vCPU to the blocked per-CPU list - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR post-block: - Remove the vCPU from the per-CPU list Signed-off-by: Feng Wu [Concentrate invocation of pre/post-block hooks to vcpu_block. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 12 +++ arch/x86/include/asm/kvm_host.h | 11 +++ arch/x86/kvm/vmx.c | 153 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 34 +++++--- include/linux/kvm_host.h | 3 + virt/kvm/kvm_main.c | 3 + 6 files changed, 206 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index d68af4dc3006..19f94a6b9bb0 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g. MMIO/PIO address->device structure mapping (kvm->buses). The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15664994b6f3..cdbdb559ecd2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -899,6 +899,17 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 99f5c61954ea..c5c22831aee2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -878,6 +878,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -2986,6 +2993,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -6045,6 +6054,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6231,6 +6259,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -10431,6 +10461,126 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10622,6 +10772,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8425a769c0a..2d2c9bb0d6d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6335,6 +6335,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); + } + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -6351,13 +6365,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6493,10 +6500,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6528,10 +6540,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + !vcpu->arch.apf.halted) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3f4538807f..9596a2f0977b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -234,6 +234,9 @@ struct kvm_vcpu { unsigned long requests; unsigned long guest_debug; + int pre_pcpu; + struct list_head blocked_vcpu_list; + struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index afd7ae6aec65..a75502c93c3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; -- cgit v1.2.3-70-g09d2 From f59922b47e0a202386c8e8dcf9f0235b8a028ae0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 16 Sep 2015 12:14:52 +0200 Subject: KVM: s390: remove unused variable in __inject_vm the float int structure is no longer used in __inject_vm. Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5c2c169395c3..ab9f525aa7cd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1390,12 +1390,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { - struct kvm_s390_float_interrupt *fi; u64 type = READ_ONCE(inti->type); int rc; - fi = &kvm->arch.float_int; - switch (type) { case KVM_S390_MCHK: rc = __inject_float_mchk(kvm, inti); -- cgit v1.2.3-70-g09d2 From fee0e0fdb2b9c221a3621bede722aa9f9c9f0d39 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Sep 2015 13:32:38 +0200 Subject: KVM: s390: disabled wait cares about machine checks, not PER We don't care about program event recording irqs (synchronous program irqs) but asynchronous irqs when checking for disabled wait. Machine checks were missing. Let's directly switch to the functions we have for that purpose instead of testing once again for magic bits. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab9f525aa7cd..1058240b3db3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu) static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) { - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT)) - return 0; - return 1; + return psw_extint_disabled(vcpu) && + psw_ioint_disabled(vcpu) && + psw_mchk_disabled(vcpu); } static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 5f94c58ed0a6db016528d8555f1b655ad354f7bb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Sep 2015 14:27:51 +0200 Subject: KVM: s390: set interception requests for all floating irqs No need to separate pending and floating irqs when setting interception requests. Let's do it for all equally. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1058240b3db3..4f05520efbae 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -107,14 +107,10 @@ static inline u8 int_word_to_isc(u32 int_word) return (int_word & 0x38000000) >> 27; } -static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu) +static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs; -} - -static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.local_int.pending_irqs; + return vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; } static unsigned long disable_iscs(struct kvm_vcpu *vcpu, @@ -133,8 +129,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) { unsigned long active_mask; - active_mask = pending_local_irqs(vcpu); - active_mask |= pending_floating_irqs(vcpu); + active_mask = pending_irqs(vcpu); if (!active_mask) return 0; @@ -202,7 +197,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { - if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK)) return; else if (psw_ioint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_IO_INT); @@ -212,7 +207,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); @@ -222,7 +217,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; -- cgit v1.2.3-70-g09d2 From 118b862b153190f92415ece4cb97a896929c5ab8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 23 Sep 2015 12:25:15 +0200 Subject: KVM: s390: kvm_arch_vcpu_runnable already cares about timer interrupts We can remove that double check. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4f05520efbae..1260f8c18df9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -839,7 +839,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) vcpu->stat.exit_wait_state++; /* fast path */ - if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu)) + if (kvm_arch_vcpu_runnable(vcpu)) return 0; if (psw_interrupts_disabled(vcpu)) { -- cgit v1.2.3-70-g09d2 From 4d32ad6becf0baf09f38707f0aff42c0f4367a99 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 6 May 2015 13:51:29 +0200 Subject: KVM: s390: drop out early in kvm_s390_has_irq() Let's get rid of the local variable and exit directly if we found any pending interrupt. This is not only faster, but also better readable. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1260f8c18df9..10a0e8beb9e1 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -808,23 +808,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) { - int rc; - - rc = !!deliverable_irqs(vcpu); + if (deliverable_irqs(vcpu)) + return 1; - if (!rc && kvm_cpu_has_pending_timer(vcpu)) - rc = 1; + if (kvm_cpu_has_pending_timer(vcpu)) + return 1; /* external call pending and deliverable */ - if (!rc && kvm_s390_ext_call_pending(vcpu) && + if (kvm_s390_ext_call_pending(vcpu) && !psw_extint_disabled(vcpu) && (vcpu->arch.sie_block->gcr[0] & 0x2000ul)) - rc = 1; - - if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) - rc = 1; + return 1; - return rc; + if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) + return 1; + return 0; } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 66933b78e3204057bfc26343afcd0d463c0e8e55 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 20 Nov 2014 13:49:32 +0100 Subject: KVM: s390: simplify in-kernel program irq injection The main reason to keep program injection in kernel separated until now was that we were able to do some checking, if really only the owning thread injects program interrupts (via waitqueue_active(li->wq)). This BUG_ON was never triggered and the chances of really hitting it, if another thread injected a program irq to another vcpu, were very small. Let's drop this check and turn kvm_s390_inject_program_int() and kvm_s390_inject_prog_irq() into simple inline functions that makes use of kvm_s390_inject_vcpu(). __must_check can be dropped as they are implicitely given by kvm_s390_inject_vcpu(), to avoid ugly long function prototypes. Reviewed-by: Jens Freimann Acked-by: Cornelia Huck Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 28 ---------------------------- arch/s390/kvm/kvm-s390.h | 24 ++++++++++++++++++++---- 2 files changed, 20 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 10a0e8beb9e1..f603bacf6ac9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -977,34 +977,6 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) return 0; } -int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - - spin_lock(&li->lock); - irq.u.pgm.code = code; - __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return 0; -} - -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - int rc; - - spin_lock(&li->lock); - irq.u.pgm = *pgm_info; - rc = __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return rc; -} - static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c446aabf60d3..3a368d2a6114 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); @@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int); int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq); -int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, + struct kvm_s390_pgm_info *pgm_info) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm = *pgm_info, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} +static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = code, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid); int kvm_s390_reinject_io_int(struct kvm *kvm, @@ -231,9 +250,6 @@ extern unsigned long kvm_s390_fac_list_mask[]; /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); -/* implemented in interrupt.c */ -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { -- cgit v1.2.3-70-g09d2 From 238293b14d9b1f5689e2aa68710000b0f25aa612 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 May 2015 12:38:48 +0200 Subject: KVM: s390: correctly handle injection of pgm irqs and per events PER events can always co-exist with other program interrupts. For now, we always overwrite all program interrupt parameters when injecting any type of program interrupt. Let's handle that correctly by only overwriting the relevant portion of the program interrupt parameters. Therefore we can now inject PER events and ordinary program interrupts concurrently, resulting in no loss of program interrupts. This will especially by helpful when manually detecting PER events later - as both types might be triggered during one SIE exit. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f603bacf6ac9..a8be542b9cb0 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -972,7 +972,26 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, irq->u.pgm.code, 0); - li->irq.pgm = irq->u.pgm; + if (irq->u.pgm.code == PGM_PER) { + li->irq.pgm.code |= PGM_PER; + /* only modify PER related information */ + li->irq.pgm.per_address = irq->u.pgm.per_address; + li->irq.pgm.per_code = irq->u.pgm.per_code; + li->irq.pgm.per_atmid = irq->u.pgm.per_atmid; + li->irq.pgm.per_access_id = irq->u.pgm.per_access_id; + } else if (!(irq->u.pgm.code & PGM_PER)) { + li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) | + irq->u.pgm.code; + /* only modify non-PER information */ + li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code; + li->irq.pgm.mon_code = irq->u.pgm.mon_code; + li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code; + li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr; + li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id; + li->irq.pgm.op_access_id = irq->u.pgm.op_access_id; + } else { + li->irq.pgm = irq->u.pgm; + } set_bit(IRQ_PEND_PROG, &li->pending_irqs); return 0; } -- cgit v1.2.3-70-g09d2 From 5a3d883a59b3fe8dc8775c7a79200a5b11a6761e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 29 Sep 2015 16:27:24 +0200 Subject: KVM: s390: switch to get_tod_clock() and fix STP sync races Nobody except early.c makes use of store_tod_clock() to handle the cc. So if we would get a cc != 0, we would be in more trouble. Let's replace all users with get_tod_clock(). Returning a cc on an ioctl sounded strange either way. We can now also easily move the get_tod_clock() call into the preempt_disable() section. This is in fact necessary to make the STP sync work as expected. Otherwise the host TOD could change and we would end up with a wrong epoch calculation. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 18 ++++-------------- arch/s390/kvm/priv.c | 8 ++------ 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0a67c40eece9..a0907795f31d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -523,19 +523,14 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_vcpu *cur_vcpu; unsigned int vcpu_idx; - u64 host_tod, gtod; - int r; + u64 gtod; if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - r = store_tod_clock(&host_tod); - if (r) - return r; - mutex_lock(&kvm->lock); preempt_disable(); - kvm->arch.epoch = gtod - host_tod; + kvm->arch.epoch = gtod - get_tod_clock(); kvm_s390_vcpu_block_all(kvm); kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; @@ -581,15 +576,10 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 host_tod, gtod; - int r; - - r = store_tod_clock(&host_tod); - if (r) - return r; + u64 gtod; preempt_disable(); - gtod = host_tod + kvm->arch.epoch; + gtod = get_tod_clock() + kvm->arch.epoch; preempt_enable(); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4d21dc4d1a84..b253de5b8945 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -34,7 +34,7 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) { struct kvm_vcpu *cpup; - s64 hostclk, val; + s64 val; int i, rc; ar_t ar; u64 op2; @@ -49,15 +49,11 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - if (store_tod_clock(&hostclk)) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - val = (val - hostclk) & ~0x3fUL; mutex_lock(&vcpu->kvm->lock); preempt_disable(); + val = (val - get_tod_clock()) & ~0x3fUL; kvm_for_each_vcpu(i, cpup, vcpu->kvm) cpup->arch.sie_block->epoch = val; preempt_enable(); -- cgit v1.2.3-70-g09d2 From 25ed16759660cdfccd4a3cb7d30cce8a797b542a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 12 May 2015 09:49:14 +0200 Subject: KVM: s390: factor out and fix setting of guest TOD clock Let's move that whole logic into one function. We now always use unsigned values when calculating the epoch (to avoid over/underflow defined). Also, we always have to get all VCPUs out of SIE before doing the update to avoid running differing VCPUs with different TODs. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 28 +++++++++++++++++----------- arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/priv.c | 15 +++------------ 3 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a0907795f31d..87bd602f326c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -521,22 +521,12 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - struct kvm_vcpu *cur_vcpu; - unsigned int vcpu_idx; u64 gtod; if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - mutex_lock(&kvm->lock); - preempt_disable(); - kvm->arch.epoch = gtod - get_tod_clock(); - kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) - cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; - kvm_s390_vcpu_unblock_all(kvm); - preempt_enable(); - mutex_unlock(&kvm->lock); + kvm_s390_set_tod_clock(kvm, gtod); VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); return 0; } @@ -1906,6 +1896,22 @@ retry: return 0; } +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) +{ + struct kvm_vcpu *vcpu; + int i; + + mutex_lock(&kvm->lock); + preempt_disable(); + kvm->arch.epoch = tod - get_tod_clock(); + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.sie_block->epoch = kvm->arch.epoch; + kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); + mutex_unlock(&kvm->lock); +} + /** * kvm_arch_fault_in_page - fault-in guest page if necessary * @vcpu: The corresponding virtual cpu diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3a368d2a6114..cc15ea3a150e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -231,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index b253de5b8945..77191b85ea7a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -33,11 +33,9 @@ /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *cpup; - s64 val; - int i, rc; + int rc; ar_t ar; - u64 op2; + u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -50,14 +48,7 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_cond(vcpu, rc); VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - - mutex_lock(&vcpu->kvm->lock); - preempt_disable(); - val = (val - get_tod_clock()) & ~0x3fUL; - kvm_for_each_vcpu(i, cpup, vcpu->kvm) - cpup->arch.sie_block->epoch = val; - preempt_enable(); - mutex_unlock(&vcpu->kvm->lock); + kvm_s390_set_tod_clock(vcpu->kvm, val); kvm_s390_set_psw_cc(vcpu, 0); return 0; -- cgit v1.2.3-70-g09d2 From 60417fcc2b0235dfe3dcd589c56dbe3ea1a64c54 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 29 Sep 2015 16:20:36 +0200 Subject: KVM: s390: factor out reading of the guest TOD clock Let's factor this out and always use get_tod_clock_fast() when reading the guest TOD. STORE CLOCK FAST does not do serialization and, therefore, might result in some fuzziness between different processors in a way that subsequent calls on different CPUs might have time stamps that are earlier. This semantics is fine though for all KVM use cases. To make it obvious that the new function has STORE CLOCK FAST semantics we name it kvm_s390_get_tod_clock_fast. With this patch, we only have a handful of places were we have to care about STP sync (using preempt_disable() logic). Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 15 +++------------ arch/s390/kvm/kvm-s390.c | 4 +--- arch/s390/kvm/kvm-s390.h | 10 ++++++++++ 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a8be542b9cb0..373e32346d68 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -69,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { - preempt_disable(); - if (!(vcpu->arch.sie_block->ckc < - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { - preempt_enable(); + if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm)) return 0; - } - preempt_enable(); return ckc_interrupts_enabled(vcpu); } @@ -851,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) goto no_timer; } - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* underflow */ @@ -892,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) u64 now, sltime; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 87bd602f326c..618c85411a51 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -568,9 +568,7 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { u64 gtod; - preempt_disable(); - gtod = get_tod_clock() + kvm->arch.epoch; - preempt_enable(); + gtod = kvm_s390_get_tod_clock_fast(kvm); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index cc15ea3a150e..1e70e00d3c5e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -271,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) kvm_s390_vcpu_unblock(vcpu); } +static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm) +{ + u64 rc; + + preempt_disable(); + rc = get_tod_clock_fast() + kvm->arch.epoch; + preempt_enable(); + return rc; +} + /** * kvm_s390_inject_prog_cond - conditionally inject a program check * @vcpu: virtual cpu -- cgit v1.2.3-70-g09d2 From c77f3fab441c3e466b4c3601a475fc31ce156b06 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:33 +0200 Subject: kvm: x86: set KVM_REQ_EVENT when updating IRR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After moving PIR to IRR, the interrupt needs to be delivered manually. Reported-by: Paolo Bonzini Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 944b38a56929..168b8759bd73 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -- cgit v1.2.3-70-g09d2 From db2bdcbbbd32e5500b822d5e74ef8b5bd777e687 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:34 +0200 Subject: KVM: x86: fix edge EOI and IOAPIC reconfig race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM uses eoi_exit_bitmap to track vectors that need an action on EOI. The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending and eoi_exit_bitmap only remembers the newest configuration; thus EOI from the pending interrupt is not recognized. (Reconfiguration is not a problem for level interrupts, because IOAPIC sends interrupt with the new configuration.) For an edge interrupt with ACK notifiers, like i8254 timer; things can happen in this order 1) IOAPIC inject a vector from i8254 2) guest reconfigures that vector's VCPU and therefore eoi_exit_bitmap on original VCPU gets cleared 3) guest's handler for the vector does EOI 4) KVM's EOI handler doesn't pass that vector to IOAPIC because it is not in that VCPU's eoi_exit_bitmap 5) i8254 stops working A simple solution is to set the IOAPIC vector in eoi_exit_bitmap if the vector is in PIR/IRR/ISR. This creates an unwanted situation if the vector is reused by a non-IOAPIC source, but I think it is so rare that we don't want to make the solution more sophisticated. The simple solution also doesn't work if we are reconfiguring the vector. (Shouldn't happen in the wild and I'd rather fix users of ACK notifiers instead of working around that.) The are no races because ioapic injection and reconfig are locked. Fixes: b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") [Before b053b2aef25d, this bug happened only with APICv.] Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2dcda0f188ba..88d0a92d3f94 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,7 +246,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e28954d2698a..e33aebbf189e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6201,8 +6201,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); - else + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } kvm_x86_ops->load_eoi_exitmap(vcpu); } -- cgit v1.2.3-70-g09d2 From 13db77347db175a68edd58a79963cdf5cb3a9607 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:30:00 +0200 Subject: KVM: x86: don't notify userspace IOAPIC on edge EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On real hardware, edge-triggered interrupts don't set a bit in TMR, which means that IOAPIC isn't notified on EOI. Do the same here. Staying in guest/kernel mode after edge EOI is what we want for most devices. If some bugs could be nicely worked around with edge EOI notifications, we should invest in a better interface. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index c89228980230..8f4499c7ffc1 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -389,13 +389,15 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { u32 dest_id, dest_mode; + bool level; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; dest_id = (entry->msi.address_lo >> 12) & 0xff; dest_mode = (entry->msi.address_lo >> 2) & 0x1; - if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, - dest_mode)) { + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { u32 vector = entry->msi.data & 0xff; __set_bit(vector, -- cgit v1.2.3-70-g09d2 From 991e7a0eedf12721f5561d7a1a54d248dc290bc0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 16 Sep 2015 17:30:05 +0800 Subject: KVM: VMX: adjust interface to allocate/free_vpid Adjust allocate/free_vid so that they can be reused for the nested vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eeba6ac5914..1452271e98ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4268,29 +4268,28 @@ static int alloc_identity_pagetable(struct kvm *kvm) return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -8629,7 +8628,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_disable_pml(vmx); - free_vpid(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8648,7 +8647,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8724,7 +8723,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } -- cgit v1.2.3-70-g09d2 From dd5f5341a3a684c8850f8a8ccbaa70e196518e76 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 23 Sep 2015 18:26:57 +0800 Subject: KVM: VMX: introduce __vmx_flush_tlb to handle specific vpid Introduce __vmx_flush_tlb() to handle specific vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1452271e98ae..e1d94f81a28d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1392,13 +1392,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1407,10 +1407,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -3563,9 +3563,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3573,6 +3573,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4915,7 +4920,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* -- cgit v1.2.3-70-g09d2 From 6a14c2222419daa7ecebc7f566841b9b4d63b397 Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Wed, 23 Sep 2015 18:06:22 +0300 Subject: powerpc/e6500: add TMCFG0 register definition The register is not currently used in the base kernel but will be in a forthcoming kvm patch. Signed-off-by: Laurentiu Tudor Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/reg_booke.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 16547efa2d5a..2fef74b474f0 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -742,6 +742,12 @@ #define MMUBE1_VBE4 0x00000002 #define MMUBE1_VBE5 0x00000001 +#define TMRN_TMCFG0 16 /* Thread Management Configuration Register 0 */ +#define TMRN_TMCFG0_NPRIBITS 0x003f0000 /* Bits of thread priority */ +#define TMRN_TMCFG0_NPRIBITS_SHIFT 16 +#define TMRN_TMCFG0_NATHRD 0x00003f00 /* Number of active threads */ +#define TMRN_TMCFG0_NATHRD_SHIFT 8 +#define TMRN_TMCFG0_NTHRD 0x0000003f /* Number of threads */ #define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */ #define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */ #define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */ -- cgit v1.2.3-70-g09d2 From d4cd4f9586f87a5fc828b4c4698aa4faf56c96fc Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Thu, 24 Sep 2015 16:00:23 +0200 Subject: KVM: PPC: e500: fix handling local_sid_lookup result The function can return negative value. The problem has been detected using proposed semantic patch scripts/coccinelle/tests/assign_signed_to_unsigned.cocci [1]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2046107 Signed-off-by: Andrzej Hajda Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b29ce752c7d6..32fdab57d604 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, struct kvm_book3e_206_tlb_entry *gtlbe) { struct vcpu_id_table *idt = vcpu_e500->idt; - unsigned int pr, tid, ts, pid; + unsigned int pr, tid, ts; + int pid; u32 val, eaddr; unsigned long flags; -- cgit v1.2.3-70-g09d2 From 2daab50e17997424af57d1ab14042a51e9a368ab Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Fri, 25 Sep 2015 18:02:23 +0300 Subject: KVM: PPC: e500: Emulate TMCFG0 TMRN register Emulate TMCFG0 TMRN register exposing one HW thread per vcpu. Signed-off-by: Mihai Caraman [Laurentiu.Tudor@freescale.com: rebased on latest kernel, use define instead of hardcoded value, moved code in own function] Signed-off-by: Laurentiu Tudor Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/disassemble.h | 5 +++++ arch/powerpc/kvm/e500_emulate.c | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 6330a61b875a..4852e849128b 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst) return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); } +static inline unsigned int get_tmrn(u32 inst) +{ + return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); +} + static inline unsigned int get_rt(u32 inst) { return (inst >> 21) & 0x1f; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index ce7291c79f6c..990db69a1d0b 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "booke.h" #include "e500.h" @@ -22,6 +23,7 @@ #define XOP_DCBTLS 166 #define XOP_MSGSND 206 #define XOP_MSGCLR 238 +#define XOP_MFTMR 366 #define XOP_TLBIVAX 786 #define XOP_TLBSX 914 #define XOP_TLBRE 946 @@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst, + int rt) +{ + /* Expose one thread per vcpu */ + if (get_tmrn(inst) == TMRN_TMCFG0) { + kvmppc_set_gpr(vcpu, rt, + 1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT)); + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} + int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_MFTMR: + emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt); + break; + case XOP_EHPRIV: emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, advance); -- cgit v1.2.3-70-g09d2 From 224f363246c3668452ec0ab5a0ff51824822f3fd Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Thu, 1 Oct 2015 15:58:03 +0300 Subject: KVM: PPC: e500: fix couple of shift operations on 64 bits Fix couple of cases where we shift left a 32-bit value thus might get truncated results on 64-bit targets. Signed-off-by: Laurentiu Tudor Suggested-by: Scott Wood Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500_mmu_host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 4d33e199edcc..5e2102c19586 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { unsigned long gfn_start, gfn_end; - tsize_pages = 1 << (tsize - 2); + tsize_pages = 1UL << (tsize - 2); gfn_start = gfn & ~(tsize_pages - 1); gfn_end = gfn_start + tsize_pages; @@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } if (likely(!pfnmap)) { - tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) -- cgit v1.2.3-70-g09d2 From 966d713e86db1916a0b45a324a935d009737316e Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 23 Mar 2015 22:24:45 +0530 Subject: KVM: PPC: Book3S HV: Deliver machine check with MSR(RI=0) to guest as MCE For the machine check interrupt that happens while we are in the guest, kvm layer attempts the recovery, and then delivers the machine check interrupt directly to the guest if recovery fails. On successful recovery we go back to normal functioning of the guest. But there can be cases where a machine check interrupt can happen with MSR(RI=0) while we are in the guest. This means MC interrupt is unrecoverable and we have to deliver a machine check to the guest since the machine check interrupt might have trashed valid values in SRR0/1. The current implementation do not handle this case, causing guest to crash with Bad kernel stack pointer instead of machine check oops message. [26281.490060] Bad kernel stack pointer 3fff9ccce5b0 at c00000000000490c [26281.490434] Oops: Bad kernel stack pointer, sig: 6 [#1] [26281.490472] SMP NR_CPUS=2048 NUMA pSeries This patch fixes this issue by checking MSR(RI=0) in KVM layer and forwarding unrecoverable interrupt to guest which then panics with proper machine check Oops message. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b98889e9851d..fb8d29650612 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2377,7 +2377,6 @@ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop - cmpdi r3, 0 /* Did we handle MCE ? */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* @@ -2390,13 +2389,18 @@ machine_check_realmode: * The old code used to return to host for unhandled errors which * was causing guest to hang with soft lockups inside guest and * makes it difficult to recover guest instance. + * + * if we receive machine check with MSR(RI=0) then deliver it to + * guest as machine check causing guest to crash. */ - ld r10, VCPU_PC(r9) ld r11, VCPU_MSR(r9) + andi. r10, r11, MSR_RI /* check for unrecoverable exception */ + beq 1f /* Deliver a machine check to guest */ + ld r10, VCPU_PC(r9) + cmpdi r3, 0 /* Did we handle MCE ? */ bne 2f /* Continue guest execution. */ /* If not, deliver a machine check. SRR0/1 are already set */ - li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - ld r11, VCPU_MSR(r9) +1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK bl kvmppc_msr_interrupt 2: b fast_interrupt_c_return -- cgit v1.2.3-70-g09d2 From 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:12:21 -0700 Subject: KVM: nVMX: emulate the INVVPID instruction Add the INVVPID instruction emulation. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d25f32ac9c5c..aa336ff3e03e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -416,6 +416,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e1d94f81a28d..9b6ab311d0bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -442,6 +442,7 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 @@ -2612,6 +2613,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -7329,7 +7332,63 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + if (get_vmcs12(vcpu)->virtual_processor_id == 0) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + vmx_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.3-70-g09d2 From 5c614b3583e7b6dab0c86356fa36c2bcbb8322a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:36 -0700 Subject: KVM: nVMX: nested VPID emulation VPID is used to tag address space and avoid a TLB flush. Currently L0 use the same VPID to run L1 and all its guests. KVM flushes VPID when switching between L1 and L2. This patch advertises VPID to the L1 hypervisor, then address space of L1 and L2 can be separately treated and avoid TLB flush when swithing between L1 and L2. For each nested vmentry, if vpid12 is changed, reuse shadow vpid w/ an invvpid. Performance: run lmbench on L2 w/ 3.5 kernel. Context switching - times in microseconds - smaller is better ------------------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------- ------ ------ ------ ------ ------ ------- ------- kernel Linux 3.5.0-1 1.2200 1.3700 1.4500 4.7800 2.3300 5.60000 2.88000 nested VPID kernel Linux 3.5.0-1 1.2600 1.4300 1.5600 12.7 12.9 3.49000 7.46000 vanilla Reviewed-by: Jan Kiszka Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b6ab311d0bd..bea552204aa2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -426,6 +426,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -1213,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -2590,6 +2598,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -6818,6 +6827,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7379,7 +7389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return 1; } - vmx_flush_tlb(vcpu); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: @@ -8759,8 +8769,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8781,6 +8793,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -9665,12 +9678,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { -- cgit v1.2.3-70-g09d2 From 089d7b6ec5151ad06a2cd524bc0580d311b641ad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:37 -0700 Subject: KVM: nVMX: expose VPID capability to L1 Expose VPID capability to L1. For nested guests, we don't do anything specific for single context invalidation. Hence, only advertise support for global context invalidation. The major benefit of nested VPID comes from having separate vpids when switching between L1 and L2, and also when L2's vCPUs not sched in/out on L1. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bea552204aa2..15bff51d492a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2622,7 +2622,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; - vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2739,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; -- cgit v1.2.3-70-g09d2 From 951f9fd74f2d826fff1d84a8ec34b491517dc15d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Sep 2015 10:34:26 +0200 Subject: KVM: x86: manually unroll bad_mt_xwr loop The loop is computing one of two constants, it can be simpler to write everything inline. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3f39aa9b9cb..b8482c0b75b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3706,7 +3706,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3724,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From 6092d3d3e6db983048469d424a8f2221915a8dd3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Oct 2015 15:10:54 +0200 Subject: kvm: svm: Only propagate next_rip when guest supports it Currently we always write the next_rip of the shadow vmcb to the guests vmcb when we emulate a vmexit. This could confuse the guest when its cpuid indicated no support for the next_rip feature. Fix this by only propagating next_rip if the guest actually supports it. Cc: Bandan Das Cc: Dirk Mueller Tested-By: Dirk Mueller Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 21 +++++++++++++++++++++ arch/x86/kvm/svm.c | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d434ee952b00..06332cb7e7d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -149,4 +149,25 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_RDTSCP)); } + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). + */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54a86183e5d3..cd8659cfc632 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -159,6 +159,9 @@ struct vcpu_svm { u32 apf_reason; u64 tsc_ratio; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -2365,7 +2368,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -4085,6 +4090,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- cgit v1.2.3-70-g09d2 From cd1872f028556dc0e8424e58413c0268c159383b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:04:13 +0900 Subject: KVM: x86: MMU: Make force_pt_level bool This will be passed to a function later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8482c0b75b2..2262728863de 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,7 +2962,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; @@ -3476,7 +3476,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3497,9 +3497,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mapping_level_dirty_bitmap(vcpu, gfn) || !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; + force_pt_level = true; else - force_pt_level = 0; + force_pt_level = false; if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..07f1a4ede637 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; @@ -747,7 +747,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) || is_self_change_mapping; else - force_pt_level = 1; + force_pt_level = true; if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); -- cgit v1.2.3-70-g09d2 From 5ed5c5c8fdbab889837c9223fc6f4bdaa830879c Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:05:13 +0900 Subject: KVM: x86: MMU: Simplify force_pt_level calculation code in FNAME(page_fault)() As a bonus, an extra memory slot search can be eliminated when is_self_change_mapping is true. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 07f1a4ede637..8ebc3a5560ce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + if (!force_pt_level) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else force_pt_level = true; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3-70-g09d2 From fd136902187838bcae3a572f41cb703553dd63b8 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:06:02 +0900 Subject: KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level() This is necessary to eliminate an extra memory slot search later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 ++++++++++++++--------------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2262728863de..890cd694c9a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -870,10 +870,16 @@ static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + if (likely(!*force_pt_level)) + *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + host_level = host_mapping_level(vcpu->kvm, large_gfn); if (host_level == PT_PAGE_TABLE_LEVEL) @@ -2962,14 +2968,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. Therefore check if the level @@ -2979,8 +2984,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3495,20 +3499,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = true; - else - force_pt_level = false; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8ebc3a5560ce..bf39d0f3efa9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -744,9 +744,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } } else -- cgit v1.2.3-70-g09d2 From d8aacf5df86a961923a2c9c547d341d64a9d9f5d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:07:01 +0900 Subject: KVM: x86: MMU: Remove mapping_level_dirty_bitmap() Now that it has only one caller, and its name is not so helpful for readers, remove it. The new memslot_valid_for_gpte() function makes it possible to share the common code between gfn_to_memslot_dirty_bitmap() and mapping_level(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 890cd694c9a2..09833b04fc97 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,25 +869,22 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); if (likely(!*force_pt_level)) - *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; -- cgit v1.2.3-70-g09d2 From 5225fdf8c8bea4418f69875804584c89a27c170e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:08:03 +0900 Subject: KVM: x86: MMU: Eliminate an extra memory slot search in mapping_level() Calling kvm_vcpu_gfn_to_memslot() twice in mapping_level() should be avoided since getting a slot by binary search may not be negligible, especially for virtual machines with many memory slots. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09833b04fc97..dd2a7c6ec2ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -896,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; -- cgit v1.2.3-70-g09d2 From 7cae2bedcbd4680b155999655e49c27b9cf020fa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 Oct 2015 19:33:09 -0300 Subject: KVM: x86: move steal time initialization to vcpu entry time As reported at https://bugs.launchpad.net/qemu/+bug/1494350, it is possible to have vcpu->arch.st.last_steal initialized from a thread other than vcpu thread, say the iothread, via KVM_SET_MSRS. Which can cause an overflow later (when subtracting from vcpu threads sched_info.run_delay). To avoid that, move steal time accumulation to vcpu entry time, before copying steal time data to guest. Signed-off-by: Marcelo Tosatti Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e33aebbf189e..9e9c226cb79d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1903,6 +1903,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2053,12 +2055,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2634,7 +2630,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } -- cgit v1.2.3-70-g09d2 From 5690891bcec5fcfda38da974ffa5488e36a59811 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Oct 2015 11:30:19 +0200 Subject: kvm: x86: zero EFER on INIT Not zeroing EFER means that a 32-bit firmware cannot enter paging mode without clearing EFER.LME first (which it should not know about). Yang Zhang from Intel confirmed that the manual is wrong and EFER is cleared to zero on INIT. Fixes: d28bc9dd25ce023270d2e039e7c98d38ecbf7758 Cc: stable@vger.kernel.org Cc: Yang Z Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++------ arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cd8659cfc632..f2c8e4917688 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1089,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1160,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1215,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1271,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1893,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 15bff51d492a..4d0aa31a42ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4932,8 +4932,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); -- cgit v1.2.3-70-g09d2 From 8c85ac1c0a1b41299370857765bc0950666ed5d9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 19 Oct 2015 15:13:29 +0900 Subject: KVM: x86: MMU: Initialize force_pt_level before calling mapping_level() Commit fd1369021878 ("KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level()") forgot to initialize force_pt_level to false in FNAME(page_fault)() before calling mapping_level() like nonpaging_map() does. This can sometimes result in forcing page table level mapping unnecessarily. Fix this and move the first *force_pt_level check in mapping_level() before kvm_vcpu_gfn_to_memslot() call to make it a bit clearer that the variable must be initialized before mapping_level() gets called. This change can also avoid calling kvm_vcpu_gfn_to_memslot() when !check_hugepage_cache_consistency() check in tdp_page_fault() forces page table level mapping. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd2a7c6ec2ca..7d85bcae3332 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -886,10 +886,11 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int host_level, level, max_level; struct kvm_memory_slot *slot; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; - if (likely(!*force_pt_level)) - *force_pt_level = !memslot_valid_for_gpte(slot, true); + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf39d0f3efa9..b41faa91a6f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; -- cgit v1.2.3-70-g09d2 From 572abd563befd56bee918316c4f7ab144d2decf5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 29 Sep 2015 17:15:45 +1000 Subject: KVM: PPC: Book3S HV: Don't fall back to smaller HPT size in allocation ioctl Currently the KVM_PPC_ALLOCATE_HTAB will try to allocate the requested size of HPT, and if that is not possible, then try to allocate smaller sizes (by factors of 2) until either a minimum is reached or the allocation succeeds. This is not ideal for userspace, particularly in migration scenarios, where the destination VM really does require the size requested. Also, the minimum HPT size of 256kB may be insufficient for the guest to run successfully. This removes the fallback to smaller sizes on allocation failure for the KVM_PPC_ALLOCATE_HTAB ioctl. The fallback still exists for the case where the HPT is allocated at the time the first VCPU is run, if no HPT has been allocated by ioctl by that time. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1f9c0a17f445..10722b1e38b5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } /* Lastly try successively smaller sizes from the page allocator */ - while (!hpt && order > PPC_MIN_HPT_ORDER) { + /* Only do this if userspace didn't specify a size via ioctl */ + while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| __GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) -- cgit v1.2.3-70-g09d2 From c64dfe2af3de448f9afa2e542983015b31c1f91c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 May 2015 14:10:54 +1000 Subject: KVM: PPC: Book3S HV: Make H_REMOVE return correct HPTE value for absent HPTEs This fixes a bug where the old HPTE value returned by H_REMOVE has the valid bit clear if the HPTE was an absent HPTE, as happens for HPTEs for emulated MMIO pages and for RAM pages that have been paged out by the host. If the absent bit is set, we clear it and set the valid bit, because from the guest's point of view, the HPTE is valid. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index c1df9bb1e413..97e7f8c853d8 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); + if (v & HPTE_V_ABSENT) + v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID; hpret[0] = v; hpret[1] = r; return H_SUCCESS; -- cgit v1.2.3-70-g09d2 From bfec5c2cc0adad3b343281eb4f9b94222c5f594f Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 16 Oct 2015 10:27:53 +0530 Subject: KVM: PPC: Implement extension to report number of memslots QEMU assumes 32 memslots if this extension is not implemented. Although, current value of KVM_USER_MEM_SLOTS is 32, once KVM_USER_MEM_SLOTS changes QEMU would take a wrong value. Signed-off-by: Nikunj A Dadhania Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2e51289610e4..6fd2405c7f4a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = num_online_cpus(); break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; -- cgit v1.2.3-70-g09d2 From 70aa3961a196ac32baf54032b2051bac9a941118 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 15 Oct 2015 11:29:58 +0530 Subject: KVM: PPC: Book3S HV: Handle H_DOORBELL on the guest exit path Currently a CPU running a guest can receive a H_DOORBELL in the following two cases: 1) When the CPU is napping due to CEDE or there not being a guest vcpu. 2) The CPU is running the guest vcpu. Case 1), the doorbell message is not cleared since we were waking up from nap. Hence when the EE bit gets set on transition from guest to host, the H_DOORBELL interrupt is delivered to the host and the corresponding handler is invoked. However in Case 2), the message gets cleared by the action of taking the H_DOORBELL interrupt. Since the CPU was running a guest, instead of invoking the doorbell handler, the code invokes the second-level interrupt handler to switch the context from the guest to the host. At this point the setting of the EE bit doesn't result in the CPU getting the doorbell interrupt since it has already been delivered once. So, the handler for this doorbell is never invoked! This causes softlockups if the missed DOORBELL was an IPI sent from a sibling subcore on the same CPU. This patch fixes it by explitly invoking the doorbell handler on the exit path if the exit reason is H_DOORBELL similar to the way an EXTERNAL interrupt is handled. Since this will also handle Case 1), we can unconditionally clear the doorbell message in kvmppc_check_wake_reason. Signed-off-by: Gautham R. Shenoy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index fb8d29650612..b1dab8d1d885 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 15f /* Invoke the H_DOORBELL handler */ cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI beq cr2, 14f /* HMI check */ @@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 b hmi_exception_after_realmode +15: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0xe80 + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -2440,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* hypervisor doorbell */ 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + + /* + * Clear the doorbell as we will invoke the handler + * explicitly in the guest exit path. + */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) /* see if it's a host IPI */ li r3, 1 lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 bnelr - /* if not, clear it and return -1 */ - lis r6, (PPC_DBELL_SERVER << (63-36))@h - PPC_MSGCLR(6) + /* if not, return -1 */ li r3, -1 blr -- cgit v1.2.3-70-g09d2 From 3217f7c25bca66eed9b07f0b8bfd1937169b0736 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 27 Aug 2015 16:41:15 +0200 Subject: KVM: Add kvm_arch_vcpu_{un}blocking callbacks Some times it is useful for architecture implementations of KVM to know when the VCPU thread is about to block or when it comes back from blocking (arm/arm64 needs to know this to properly implement timers, for example). Therefore provide a generic architecture callback function in line with what we do elsewhere for KVM generic-arch interactions. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 3 +++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 3 +++ 8 files changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..84da97901f1f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,4 +234,7 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..e4f4d65f7d2b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,4 +255,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5a1a882e0a75..6ded8d347af9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 827a38d7a9db..c9f122d00920 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -718,5 +718,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8ced426091e1..72a614c68ed8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..b28f0f142ecb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1233,4 +1233,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..4a86f5f072c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -625,6 +625,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..7873d6daccb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2018,6 +2018,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + kvm_arch_vcpu_blocking(vcpu); + for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); @@ -2031,6 +2033,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); cur = ktime_get(); + kvm_arch_vcpu_unblocking(vcpu); out: block_ns = ktime_to_ns(cur) - ktime_to_ns(start); -- cgit v1.2.3-70-g09d2 From d35268da66870d733ae763fd7f9b06a1f63f395e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 25 Aug 2015 19:48:21 +0200 Subject: arm/arm64: KVM: arch_timer: Only schedule soft timer on vcpu_block We currently schedule a soft timer every time we exit the guest if the timer did not expire while running the guest. This is really not necessary, because the only work we do in the timer work function is to kick the vcpu. Kicking the vcpu does two things: (1) If the vpcu thread is on a waitqueue, make it runnable and remove it from the waitqueue. (2) If the vcpu is running on a different physical CPU from the one doing the kick, it sends a reschedule IPI. The second case cannot happen, because the soft timer is only ever scheduled when the vcpu is not running. The first case is only relevant when the vcpu thread is on a waitqueue, which is only the case when the vcpu thread has called kvm_vcpu_block(). Therefore, we only need to make sure a timer is scheduled for kvm_vcpu_block(), which we do by encapsulating all calls to kvm_vcpu_block() with kvm_timer_{un}schedule calls. Additionally, we only schedule a soft timer if the timer is enabled and unmasked, since it is useless otherwise. Note that theoretically userspace can use the SET_ONE_REG interface to change registers that should cause the timer to fire, even if the vcpu is blocked without a scheduled timer, but this case was not supported before this patch and we leave it for future work for now. Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 -- arch/arm/kvm/arm.c | 10 +++++ arch/arm64/include/asm/kvm_host.h | 3 -- include/kvm/arm_arch_timer.h | 2 + virt/kvm/arm/arch_timer.c | 94 +++++++++++++++++++++++++-------------- 5 files changed, 72 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 84da97901f1f..c4072d9f32c7 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,7 +234,4 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 78b286994577..7ed4d475d83a 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvm_timer_should_fire(vcpu); } +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_schedule(vcpu); +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_unschedule(vcpu); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { /* Force users to call KVM_ARM_VCPU_INIT */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e4f4d65f7d2b..ed039688c221 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,7 +255,4 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index e1e4d7c38dda..ef14cc1f1f26 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); +void kvm_timer_schedule(struct kvm_vcpu *vcpu); +void kvm_timer_unschedule(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index b9d3a32cbc04..32095fbb5d7c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -111,14 +111,21 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } +static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) && + !kvm_vgic_get_phys_irq_active(timer->map); +} + bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; cycle_t cval, now; - if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || - !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) || - kvm_vgic_get_phys_irq_active(timer->map)) + if (!kvm_timer_irq_can_fire(vcpu)) return false; cval = timer->cntv_cval; @@ -127,12 +134,57 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return cval <= now; } +/* + * Schedule the background timer before calling kvm_vcpu_block, so that this + * thread is removed from its waitqueue and made runnable when there's a timer + * interrupt to handle. + */ +void kvm_timer_schedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + u64 ns; + cycle_t cval, now; + + BUG_ON(timer_is_armed(timer)); + + /* + * No need to schedule a background timer if the guest timer has + * already expired, because kvm_vcpu_block will return before putting + * the thread to sleep. + */ + if (kvm_timer_should_fire(vcpu)) + return; + + /* + * If the timer is not capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(vcpu)) + return; + + /* The timer has not yet expired, schedule a background timer */ + cval = timer->cntv_cval; + now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + + ns = cyclecounter_cyc2ns(timecounter->cc, + cval - now, + timecounter->mask, + &timecounter->frac); + timer_arm(timer, ns); +} + +void kvm_timer_unschedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + timer_disarm(timer); +} + /** * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu * @vcpu: The vcpu pointer * - * Disarm any pending soft timers, since the world-switch code will write the - * virtual timer state back to the physical CPU. + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case. */ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) { @@ -140,17 +192,6 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) bool phys_active; int ret; - /* - * We're about to run this vcpu again, so there is no need to - * keep the background timer running, as we're about to - * populate the CPU timer again. - */ - timer_disarm(timer); - - /* - * If the timer expired while we were not scheduled, now is the time - * to inject it. - */ if (kvm_timer_should_fire(vcpu)) kvm_timer_inject_irq(vcpu); @@ -176,32 +217,17 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer was armed and either schedule a corresponding - * soft timer or inject directly if already expired. + * Check if the virtual timer has expired while we were running in the guest, + * and inject an interrupt if that was the case. */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - cycle_t cval, now; - u64 ns; BUG_ON(timer_is_armed(timer)); - if (kvm_timer_should_fire(vcpu)) { - /* - * Timer has already expired while we were not - * looking. Inject the interrupt and carry on. - */ + if (kvm_timer_should_fire(vcpu)) kvm_timer_inject_irq(vcpu); - return; - } - - cval = timer->cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, - &timecounter->frac); - timer_arm(timer, ns); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From 4b4b4512da2a844b8da2585609b67fae1ce4f4db Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 15:01:27 +0200 Subject: arm/arm64: KVM: Rework the arch timer to use level-triggered semantics The arch timer currently uses edge-triggered semantics in the sense that the line is never sampled by the vgic and lowering the line from the timer to the vgic doesn't have any effect on the pending state of virtual interrupts in the vgic. This means that we do not support a guest with the otherwise valid behavior of (1) disable interrupts (2) enable the timer (3) disable the timer (4) enable interrupts. Such a guest would validly not expect to see any interrupts on real hardware, but will see interrupts on KVM. This patch fixes this shortcoming through the following series of changes. First, we change the flow of the timer/vgic sync/flush operations. Now the timer is always flushed/synced before the vgic, because the vgic samples the state of the timer output. This has the implication that we move the timer operations in to non-preempible sections, but that is fine after the previous commit getting rid of hrtimer schedules on every entry/exit. Second, we change the internal behavior of the timer, letting the timer keep track of its previous output state, and only lower/raise the line to the vgic when the state changes. Note that in theory this could have been accomplished more simply by signalling the vgic every time the state *potentially* changed, but we don't want to be hitting the vgic more often than necessary. Third, we get rid of the use of the map->active field in the vgic and instead simply set the interrupt as active on the physical distributor whenever the input to the GIC is asserted and conversely clear the physical active state when the input to the GIC is deasserted. Fourth, and finally, we now initialize the timer PPIs (and all the other unused PPIs for now), to be level-triggered, and modify the sync code to sample the line state on HW sync and re-inject a new interrupt if it is still pending at that time. Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 11 +++-- include/kvm/arm_arch_timer.h | 2 +- include/kvm/arm_vgic.h | 3 -- virt/kvm/arm/arch_timer.c | 81 +++++++++++++++++++++++----------- virt/kvm/arm/vgic.c | 102 +++++++++++-------------------------------- 5 files changed, 91 insertions(+), 108 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7ed4d475d83a..59125f48c707 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -561,9 +561,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { local_irq_enable(); + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); continue; } @@ -608,12 +608,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_guest_exit(); trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + /* + * We must sync the timer state before the vgic state so that + * the vgic can properly sample the updated state of the + * interrupt line. + */ + kvm_timer_sync_hwstate(vcpu); + kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); - ret = handle_exit(vcpu, run, ret); } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index ef14cc1f1f26..1800227af9d6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,7 +51,7 @@ struct arch_timer_cpu { bool armed; /* Timer IRQ */ - const struct kvm_irq_level *irq; + struct kvm_irq_level irq; /* VGIC mapping */ struct irq_phys_map *map; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4e14dac282bb..7bc5d0224ab0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -159,7 +159,6 @@ struct irq_phys_map { u32 virt_irq; u32 phys_irq; u32 irq; - bool active; }; struct irq_phys_map_entry { @@ -354,8 +353,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu); struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map); -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 32095fbb5d7c..523816d8c402 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -59,18 +59,6 @@ static void timer_disarm(struct arch_timer_cpu *timer) } } -static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) -{ - int ret; - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - kvm_vgic_set_phys_irq_active(timer->map, true); - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, - timer->map, - timer->irq->level); - WARN_ON(ret); -} - static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; @@ -116,8 +104,7 @@ static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) && - !kvm_vgic_get_phys_irq_active(timer->map); + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); } bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) @@ -134,6 +121,41 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return cval <= now; } +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) +{ + int ret; + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + BUG_ON(!vgic_initialized(vcpu->kvm)); + + timer->irq.level = new_level; + ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, + timer->map, + timer->irq.level); + WARN_ON(ret); +} + +/* + * Check if there was a change in the timer state (should we raise or lower + * the line level to the GIC). + */ +static void kvm_timer_update_state(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + /* + * If userspace modified the timer registers via SET_ONE_REG before + * the vgic was initialized, we mustn't set the timer->irq.level value + * because the guest would never see the interrupt. Instead wait + * until we call this function from kvm_timer_flush_hwstate. + */ + if (!vgic_initialized(vcpu->kvm)) + return; + + if (kvm_timer_should_fire(vcpu) != timer->irq.level) + kvm_timer_update_irq(vcpu, !timer->irq.level); +} + /* * Schedule the background timer before calling kvm_vcpu_block, so that this * thread is removed from its waitqueue and made runnable when there's a timer @@ -192,17 +214,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) bool phys_active; int ret; - if (kvm_timer_should_fire(vcpu)) - kvm_timer_inject_irq(vcpu); + kvm_timer_update_state(vcpu); /* - * We keep track of whether the edge-triggered interrupt has been - * signalled to the vgic/guest, and if so, we mask the interrupt and - * the physical distributor to prevent the timer from raising a - * physical interrupt whenever we run a guest, preventing forward - * VCPU progress. + * If we enter the guest with the virtual input level to the VGIC + * asserted, then we have already told the VGIC what we need to, and + * we don't need to exit from the guest until the guest deactivates + * the already injected interrupt, so therefore we should set the + * hardware active state to prevent unnecessary exits from the guest. + * + * Conversely, if the virtual input level is deasserted, then always + * clear the hardware active state to ensure that hardware interrupts + * from the timer triggers a guest exit. */ - if (kvm_vgic_get_phys_irq_active(timer->map)) + if (timer->irq.level) phys_active = true; else phys_active = false; @@ -226,8 +251,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) BUG_ON(timer_is_armed(timer)); - if (kvm_timer_should_fire(vcpu)) - kvm_timer_inject_irq(vcpu); + /* + * The guest could have modified the timer registers or the timer + * could have expired, update the timer state. + */ + kvm_timer_update_state(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, @@ -242,7 +270,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * kvm_vcpu_set_target(). To handle this, we determine * vcpu timer irq number when the vcpu is reset. */ - timer->irq = irq; + timer->irq.irq = irq->irq; /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -251,6 +279,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * the ARMv7 architecture. */ timer->cntv_ctl = 0; + kvm_timer_update_state(vcpu); /* * Tell the VGIC that the virtual interrupt is tied to a @@ -295,6 +324,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) default: return -1; } + + kvm_timer_update_state(vcpu); return 0; } diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index a44ecf9eca4e..3c2909c1bda3 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -537,34 +537,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm, return false; } -/* - * If a mapped interrupt's state has been modified by the guest such that it - * is no longer active or pending, without it have gone through the sync path, - * then the map->active field must be cleared so the interrupt can be taken - * again. - */ -static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct list_head *root; - struct irq_phys_map_entry *entry; - struct irq_phys_map *map; - - rcu_read_lock(); - - /* Check for PPIs */ - root = &vgic_cpu->irq_phys_map_list; - list_for_each_entry_rcu(entry, root, entry) { - map = &entry->map; - - if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) && - !vgic_irq_is_active(vcpu, map->virt_irq)) - map->active = false; - } - - rcu_read_unlock(); -} - bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, phys_addr_t offset, int vcpu_id) @@ -595,7 +567,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm, vcpu_id, offset); vgic_reg_access(mmio, reg, offset, mode); - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); vgic_update_state(kvm); return true; } @@ -633,7 +604,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); if (mmio->is_write) { - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); vgic_update_state(kvm); return true; } @@ -1443,29 +1413,37 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) /* * Save the physical active state, and reset it to inactive. * - * Return 1 if HW interrupt went from active to inactive, and 0 otherwise. + * Return true if there's a pending level triggered interrupt line to queue. */ -static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) +static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct irq_phys_map *map; + bool phys_active; + bool level_pending; int ret; if (!(vlr.state & LR_HW)) - return 0; + return false; map = vgic_irq_map_search(vcpu, vlr.irq); BUG_ON(!map); ret = irq_get_irqchip_state(map->irq, IRQCHIP_STATE_ACTIVE, - &map->active); + &phys_active); WARN_ON(ret); - if (map->active) + if (phys_active) return 0; - return 1; + /* Mapped edge-triggered interrupts not yet supported. */ + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); + spin_lock(&dist->lock); + level_pending = process_level_irq(vcpu, lr, vlr); + spin_unlock(&dist->lock); + return level_pending; } /* Sync back the VGIC state after a guest run */ @@ -1490,18 +1468,8 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) continue; vlr = vgic_get_lr(vcpu, lr); - if (vgic_sync_hwirq(vcpu, vlr)) { - /* - * So this is a HW interrupt that the guest - * EOI-ed. Clean the LR state and allow the - * interrupt to be sampled again. - */ - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); - vgic_irq_clear_queued(vcpu, vlr.irq); - set_bit(lr, elrsr_ptr); - } + if (vgic_sync_hwirq(vcpu, lr, vlr)) + level_pending = true; if (!test_bit(lr, elrsr_ptr)) continue; @@ -1880,30 +1848,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) kfree(entry); } -/** - * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ - * - * Return the logical active state of a mapped interrupt. This doesn't - * necessarily reflects the current HW state. - */ -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map) -{ - BUG_ON(!map); - return map->active; -} - -/** - * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ - * - * Set the logical active state of a mapped interrupt. This doesn't - * immediately affects the HW state. - */ -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active) -{ - BUG_ON(!map); - map->active = active; -} - /** * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping * @vcpu: The VCPU pointer @@ -2129,17 +2073,23 @@ int vgic_init(struct kvm *kvm) } /* - * Enable all SGIs and configure all private IRQs as - * edge-triggered. + * Enable and configure all SGIs to be edge-triggere and + * configure all PPIs as level-triggered. */ for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - if (i < VGIC_NR_SGIS) + if (i < VGIC_NR_SGIS) { + /* SGIs */ vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); - if (i < VGIC_NR_PRIVATE_IRQS) vgic_bitmap_set_irq_val(&dist->irq_cfg, vcpu->vcpu_id, i, VGIC_CFG_EDGE); + } else if (i < VGIC_NR_PRIVATE_IRQS) { + /* PPIs */ + vgic_bitmap_set_irq_val(&dist->irq_cfg, + vcpu->vcpu_id, i, + VGIC_CFG_LEVEL); + } } vgic_enable(vcpu); -- cgit v1.2.3-70-g09d2 From 75755c6d02df9e9b959b3066c12de5494907e3d9 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 9 Oct 2015 10:08:43 -0500 Subject: arm/arm64: KVM : Enable vhost device selection under KVM config menu vhost drivers provide guest VMs with better I/O performance and lower CPU utilization. This patch allows users to select vhost devices under KVM configuration menu on ARM. This makes vhost support on arm/arm64 on a par with other architectures (e.g. x86, ppc). Signed-off-by: Wei Huang Signed-off-by: Christoffer Dall --- arch/arm/kvm/Kconfig | 2 ++ arch/arm64/kvm/Kconfig | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 356970f3b25e..95a000515e43 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -46,4 +46,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 5c7e920e4861..38102f5d3cbb 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -41,4 +41,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION -- cgit v1.2.3-70-g09d2 From 3781528e3045e7c9cc7c4846e0f675b1f353655f Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:14 +0200 Subject: KVM: arm/arm64: rename pause into power_off The kvm_vcpu_arch pause field is renamed into power_off to prepare for the introduction of a new pause field. Also vcpu_pause is renamed into vcpu_sleep since we will sleep until both power_off and pause are false. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 4 ++-- arch/arm/kvm/arm.c | 20 ++++++++++---------- arch/arm/kvm/psci.c | 10 +++++----- arch/arm64/include/asm/kvm_host.h | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..107374f986fd 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -126,8 +126,8 @@ struct kvm_vcpu_arch { * here. */ - /* Don't run the guest on this vcpu */ - bool pause; + /* vcpu power-off state */ + bool power_off; /* IO related fields */ struct kvm_decode mmio_decode; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 59125f48c707..9d2fb4772d8c 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -318,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.pause) + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; @@ -331,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.pause = false; + vcpu->arch.power_off = false; break; case KVM_MP_STATE_STOPPED: - vcpu->arch.pause = true; + vcpu->arch.power_off = true; break; default: return -EINVAL; @@ -478,11 +478,11 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } -static void vcpu_pause(struct kvm_vcpu *vcpu) +static void vcpu_sleep(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.pause); + wait_event_interruptible(*wq, !vcpu->arch.power_off); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -532,8 +532,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) update_vttbr(vcpu->kvm); - if (vcpu->arch.pause) - vcpu_pause(vcpu); + if (vcpu->arch.power_off) + vcpu_sleep(vcpu); /* * Disarming the background timer must be done in a @@ -780,12 +780,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, vcpu_reset_hcr(vcpu); /* - * Handle the "start in power-off" case by marking the VCPU as paused. + * Handle the "start in power-off" case. */ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu->arch.pause = true; + vcpu->arch.power_off = true; else - vcpu->arch.pause = false; + vcpu->arch.power_off = false; return 0; } diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index ad6f6424f1d1..0b556968a6da 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) { - vcpu->arch.pause = true; + vcpu->arch.power_off = true; } static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) @@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.pause) { + if (!vcpu->arch.power_off) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else @@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) * the general puspose registers are undefined upon CPU_ON. */ *vcpu_reg(vcpu, 0) = context_id; - vcpu->arch.pause = false; + vcpu->arch.power_off = false; smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); @@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) mpidr = kvm_vcpu_get_mpidr_aff(tmp); if ((mpidr & target_affinity_mask) == target_affinity) { matching_cpus++; - if (!tmp->arch.pause) + if (!tmp->arch.power_off) return PSCI_0_2_AFFINITY_LEVEL_ON; } } @@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) * re-initialized. */ kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - tmp->arch.pause = true; + tmp->arch.power_off = true; kvm_vcpu_kick(tmp); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..4b4157b3b983 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -149,8 +149,8 @@ struct kvm_vcpu_arch { u32 mdscr_el1; } guest_debug_preserved; - /* Don't run the guest */ - bool pause; + /* vcpu power-off state */ + bool power_off; /* IO related fields */ struct kvm_decode mmio_decode; -- cgit v1.2.3-70-g09d2 From 4f5f1dc03606e18986b874f899cf86b0a3e4f2a5 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:15 +0200 Subject: KVM: arm/arm64: check power_off in kvm_arch_vcpu_runnable kvm_arch_vcpu_runnable now also checks whether the power_off flag is set. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9d2fb4772d8c..d04deeb1d4e0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -352,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v); + return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !v->arch.power_off); } /* Just ensure a guest exit from a particular CPU */ -- cgit v1.2.3-70-g09d2 From 101d3da09c953b08c814cd9a0b8605623d640ba0 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:16 +0200 Subject: KVM: arm/arm64: check power_off in critical section before VCPU run In case a vcpu off PSCI call is called just after we executed the vcpu_sleep check, we can enter the guest although power_off is set. Let's check the power_off state in the critical section, just before entering the guest. Signed-off-by: Eric Auger Reported-by: Christoffer Dall Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d04deeb1d4e0..3b3384c37e24 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -560,7 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) run->exit_reason = KVM_EXIT_INTR; } - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { + if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + vcpu->arch.power_off) { local_irq_enable(); kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); -- cgit v1.2.3-70-g09d2 From 3b92830ad41b2fe377e0765322e8aefd0ab8388d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:17 +0200 Subject: KVM: arm/arm64: implement kvm_arm_[halt,resume]_guest We introduce kvm_arm_halt_guest and resume functions. They will be used for IRQ forward state change. Halt is synchronous and prevents the guest from being re-entered. We use the same mechanism put in place for PSCI former pause, now renamed power_off. A new flag is introduced in arch vcpu state, pause, only meant to be used by those functions. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm/kvm/arm.c | 35 +++++++++++++++++++++++++++++++---- arch/arm64/include/asm/kvm_host.h | 3 +++ 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 107374f986fd..6692982c9b57 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -129,6 +129,9 @@ struct kvm_vcpu_arch { /* vcpu power-off state */ bool power_off; + /* Don't run the guest (internal implementation need) */ + bool pause; + /* IO related fields */ struct kvm_decode mmio_decode; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3b3384c37e24..ed1574724caf 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -353,7 +353,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off); + && !v->arch.power_off && !v->arch.pause); } /* Just ensure a guest exit from a particular CPU */ @@ -479,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } +static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused; +static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused; + +static void kvm_arm_halt_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + force_vm_exit(cpu_all_mask); +} + +static void kvm_arm_resume_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + + vcpu->arch.pause = false; + wake_up_interruptible(wq); + } +} + static void vcpu_sleep(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.power_off); + wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + (!vcpu->arch.pause))); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -533,7 +560,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) update_vttbr(vcpu->kvm); - if (vcpu->arch.power_off) + if (vcpu->arch.power_off || vcpu->arch.pause) vcpu_sleep(vcpu); /* @@ -561,7 +588,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) } if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || - vcpu->arch.power_off) { + vcpu->arch.power_off || vcpu->arch.pause) { local_irq_enable(); kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4b4157b3b983..a35ce7266aac 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -152,6 +152,9 @@ struct kvm_vcpu_arch { /* vcpu power-off state */ bool power_off; + /* Don't run the guest (internal implementation need) */ + bool pause; + /* IO related fields */ struct kvm_decode mmio_decode; -- cgit v1.2.3-70-g09d2 From b5905dc12ed4254f7e0aac62bab48f002181f639 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 15:55:22 +0200 Subject: arm/arm64: KVM: Improve kvm_exit tracepoint The ARM architecture only saves the exit class to the HSR (ESR_EL2 for arm64) on synchronous exceptions, not on asynchronous exceptions like an IRQ. However, we only report the exception class on kvm_exit, which is confusing because an IRQ looks like it exited at some PC with the same reason as the previous exit. Add a lookup table for the exception index and prepend the kvm_exit tracepoint text with the exception type to clarify this situation. Also resolve the exception class (EC) to a human-friendly text version so the trace output becomes immediately usable for debugging this code. Cc: Wei Huang Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_arm.h | 20 ++++++++++++++++++++ arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/trace.h | 10 +++++++--- arch/arm64/include/asm/kvm_arm.h | 16 ++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index d995821f1698..dc641ddf0784 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -218,4 +218,24 @@ #define HSR_DABT_CM (1U << 8) #define HSR_DABT_EA (1U << 9) +#define kvm_arm_exception_type \ + {0, "RESET" }, \ + {1, "UNDEFINED" }, \ + {2, "SOFTWARE" }, \ + {3, "PREF_ABORT" }, \ + {4, "DATA_ABORT" }, \ + {5, "IRQ" }, \ + {6, "FIQ" }, \ + {7, "HVC" } + +#define HSRECN(x) { HSR_EC_##x, #x } + +#define kvm_arm_exception_class \ + HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \ + HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \ + HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \ + HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \ + HSRECN(DABT), HSRECN(DABT_HYP) + + #endif /* __ARM_KVM_ARM_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index ed1574724caf..eab83b2435b8 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -635,7 +635,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * guest time. */ kvm_guest_exit(); - trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* * We must sync the timer state before the vgic state so that diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index 0ec35392d208..c25a88598eb0 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(exit_reason, vcpu_pc), + TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), + TP_ARGS(idx, exit_reason, vcpu_pc), TP_STRUCT__entry( + __field( int, idx ) __field( unsigned int, exit_reason ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( + __entry->idx = idx; __entry->exit_reason = exit_reason; __entry->vcpu_pc = vcpu_pc; ), - TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + __print_symbolic(__entry->idx, kvm_arm_exception_type), __entry->exit_reason, + __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), __entry->vcpu_pc) ); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 9694f2654593..5e6857b6bdc4 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -200,4 +200,20 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +#define kvm_arm_exception_type \ + {0, "IRQ" }, \ + {1, "TRAP" } + +#define ECN(x) { ESR_ELx_EC_##x, #x } + +#define kvm_arm_exception_class \ + ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \ + ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \ + ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \ + ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ + ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ + ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ + ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ + ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) + #endif /* __ARM64_KVM_ARM_H__ */ -- cgit v1.2.3-70-g09d2 From db85c55f1b01b155332058753854d897e965d67f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 12 Oct 2015 15:04:50 +0100 Subject: arm64: kvm: restore EL1N SP for panic If we panic in hyp mode, we inject a call to panic() into the EL1N host kernel. If a guest context is active, we first attempt to restore the minimal amount of state necessary to execute the host kernel with restore_sysregs. However, the SP is restored as part of restore_common_regs, and so we may return to the host's panic() function with the SP of the guest. Any calculations based on the SP will be bogus, and any attempt to access the stack will result in recursive data aborts. When running Linux as a guest, the guest's EL1N SP is like to be some valid kernel address. In this case, the host kernel may use that region as a stack for panic(), corrupting it in the process. Avoid the problem by restoring the host SP prior to returning to the host. To prevent misleading backtraces in the host, the FP is zeroed at the same time. We don't need any of the other "common" registers in order to panic successfully. Signed-off-by: Mark Rutland Acked-by: Marc Zyngier Cc: Christoffer Dall Cc: Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index e5836138ec42..1599701ef044 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -880,6 +880,14 @@ __kvm_hyp_panic: bl __restore_sysregs + /* + * Make sure we have a valid host stack, and don't leave junk in the + * frame pointer that will give us a misleading host stack unwinding. + */ + ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)] + msr sp_el1, x22 + mov x29, xzr + 1: adr x0, __hyp_panic_str adr x1, 2f ldp x2, x3, [x1] -- cgit v1.2.3-70-g09d2 From c5c2c393468576bad6d10b2b5fefff8cd25df3f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 26 Oct 2015 08:41:29 +0100 Subject: KVM: s390: SCA must not cross page boundaries We seemed to have missed a few corner cases in commit f6c137ff00a4 ("KVM: s390: randomize sca address"). The SCA has a maximum size of 2112 bytes. By setting the sca_offset to some unlucky numbers, we exceed the page. 0x7c0 (1984) -> Fits exactly 0x7d0 (2000) -> 16 bytes out 0x7e0 (2016) -> 32 bytes out 0x7f0 (2032) -> 48 bytes out One VCPU entry is 32 bytes long. For the last two cases, we actually write data to the other page. 1. The address of the VCPU. 2. Injection/delivery/clearing of SIGP externall calls via SIGP IF. Especially the 2. happens regularly. So this could produce two problems: 1. The guest losing/getting external calls. 2. Random memory overwrites in the host. So this problem happens on every 127 + 128 created VM with 64 VCPUs. Cc: stable@vger.kernel.org # v3.15+ Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 618c85411a51..35596177fad2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1098,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); - sca_offset = (sca_offset + 16) & 0x7f0; + sca_offset += 16; + if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE) + sca_offset = 0; kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset); spin_unlock(&kvm_lock); -- cgit v1.2.3-70-g09d2 From 58c383c62e1a4379cee531b56e4293211f2d5ded Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 12 Oct 2015 13:27:29 +0200 Subject: KVM: s390: drop useless newline in debugging data the s390 debug feature does not need newlines. In fact it will result in empty lines. Get rid of 4 leftovers. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 35596177fad2..07a6aa896c12 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -514,7 +514,7 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (gtod_high != 0) return -EINVAL; - VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high); return 0; } @@ -527,7 +527,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) return -EFAULT; kvm_s390_set_tod_clock(kvm, gtod); - VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod); return 0; } @@ -559,7 +559,7 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_to_user((void __user *)attr->addr, >od_high, sizeof(gtod_high))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high); return 0; } @@ -571,7 +571,7 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) gtod = kvm_s390_get_tod_clock_fast(kvm); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod); return 0; } -- cgit v1.2.3-70-g09d2 From 46b708ea875f14f5496109df053624199f3aae87 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 23 Jul 2015 15:24:03 +0200 Subject: KVM: s390: use simple switch statement as multiplexer We currently do some magic shifting (by exploiting that exit codes are always a multiple of 4) and a table lookup to jump into the exit handlers. This causes some calculations and checks, just to do an potentially expensive function call. Changing that to a switch statement gives the compiler the chance to inline and dynamically decide between jump tables or inline compare and branches. In addition it makes the code more readable. bloat-o-meter gives me a small reduction in code size: add/remove: 0/7 grow/shrink: 1/1 up/down: 986/-1334 (-348) function old new delta kvm_handle_sie_intercept 72 1058 +986 handle_prog 704 696 -8 handle_noop 54 - -54 handle_partial_execution 60 - -60 intercept_funcs 120 - -120 handle_instruction 198 - -198 handle_validity 210 - -210 handle_stop 316 - -316 handle_external_interrupt 368 - -368 Right now my gcc does conditional branches instead of jump tables. The inlining seems to give us enough cycles as some micro-benchmarking shows minimal improvements, but still in noise. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck --- arch/s390/kvm/intercept.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7365e8a46032..b4a5aa110cec 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } -static const intercept_handler_t intercept_funcs[] = { - [0x00 >> 2] = handle_noop, - [0x04 >> 2] = handle_instruction, - [0x08 >> 2] = handle_prog, - [0x10 >> 2] = handle_noop, - [0x14 >> 2] = handle_external_interrupt, - [0x18 >> 2] = handle_noop, - [0x1C >> 2] = kvm_s390_handle_wait, - [0x20 >> 2] = handle_validity, - [0x28 >> 2] = handle_stop, - [0x38 >> 2] = handle_partial_execution, -}; - int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { - intercept_handler_t func; - u8 code = vcpu->arch.sie_block->icptcode; - - if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) + switch (vcpu->arch.sie_block->icptcode) { + case 0x00: + case 0x10: + case 0x18: + return handle_noop(vcpu); + case 0x04: + return handle_instruction(vcpu); + case 0x08: + return handle_prog(vcpu); + case 0x14: + return handle_external_interrupt(vcpu); + case 0x1c: + return kvm_s390_handle_wait(vcpu); + case 0x20: + return handle_validity(vcpu); + case 0x28: + return handle_stop(vcpu); + case 0x38: + return handle_partial_execution(vcpu); + default: return -EOPNOTSUPP; - func = intercept_funcs[code >> 2]; - if (func) - return func(vcpu); - return -EOPNOTSUPP; + } } -- cgit v1.2.3-70-g09d2 From 2da29bccc5045ea10c70cb3a69be777768fd0b66 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Fri, 30 Oct 2015 12:56:11 +0530 Subject: KVM: x86: removing unused variable removing unused variables, found by coccinelle Signed-off-by: Saurabh Sengar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e9c226cb79d..57b5f79aad08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3438,41 +3438,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3484,7 +3478,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) kvm->arch.vpit->pit_state.flags = ps->flags; kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, -- cgit v1.2.3-70-g09d2 From 7a036a6f670f63b32c5ee126425f9109271ca13f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:24 +0100 Subject: KVM: x86: add read_phys to x86_emulate_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to read the physical memory when emulating RSM. X86EMUL_IO_NEEDED is returned on all errors for consistency with other helpers. Signed-off-by: Radim Krčmář Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 10 ++++++++++ arch/x86/kvm/x86.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec473c..e9cd7befcb76 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -111,6 +111,16 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); + /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57b5f79aad08..a24bae0a83a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4089,6 +4089,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4824,6 +4833,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, -- cgit v1.2.3-70-g09d2 From f40606b147dd5b4678cedc877a71deb520ca507e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:25 +0100 Subject: KVM: x86: handle SMBASE as physical address in RSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET_SMSTATE depends on real mode to ensure that smbase+offset is treated as a physical address, which has already caused a bug after shuffling the code. Enforce physical addressing. Signed-off-by: Radim Krčmář Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9da95b9daf8d..b60fed56671b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,8 +2484,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed - * to read_std/write_std are not virtual. + * CR0/CR3/CR4/EFER. * * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. */ -- cgit v1.2.3-70-g09d2 From c75efa974e013640496620f26f0b532cb5cb17f9 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:50 +0300 Subject: drivers/hv: share Hyper-V SynIC constants with userspace Moved Hyper-V synic contants from guest Hyper-V drivers private header into x86 arch uapi Hyper-V header. Added Hyper-V synic msr's flags into x86 arch uapi Hyper-V header. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 12 ++++++++++++ drivers/hv/hyperv_vmbus.h | 5 ----- include/linux/hyperv.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 2677a0aac2cc..040d4083c24f 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -257,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + #endif diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3d70e36c918e..3782636562a1 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -63,9 +63,6 @@ enum hv_cpuid_function { /* Define version of the synthetic interrupt controller. */ #define HV_SYNIC_VERSION (1) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) - /* Define synthetic interrupt controller message constants. */ #define HV_MESSAGE_SIZE (256) #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) @@ -105,8 +102,6 @@ enum hv_message_type { HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) #define HV_SYNIC_STIMER_COUNT (4) /* Define invalid partition identifier. */ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 54733d5b503e..8fdc17b84739 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -26,6 +26,7 @@ #define _HYPERV_H #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 0669a51015c58b1f036030743a0c0781eb63867f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 15:48:20 +0100 Subject: KVM: x86: zero apic_arb_prio on reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BSP doesn't get INIT so its apic_arb_prio isn't zeroed after reboot. BSP won't get lowest priority interrupts until other VCPUs get enough interrupts to match their pre-reboot apic_arb_prio. That behavior doesn't fit into KVM's round-robin-like interpretation of lowest priority delivery ... userspace should KVM_SET_LAPIC on reset, so just zero apic_arb_prio there. Reported-by: Yuki Shibuya Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 168b8759bd73..ecd4ea1d28a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,6 +1918,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From b97e6de9c96cefaa02a6a7464731ea504b45e150 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:16:47 +0100 Subject: KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do not want to do too much work in atomic context, in particular not walking all the VCPUs of the virtual machine. So we want to distinguish the architecture-specific injection function for irqfd from kvm_set_msi. Since it's still empty, reuse the newly added kvm_arch_set_irq and rename it to kvm_arch_set_irq_inatomic. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 14 ++++++++------ include/linux/kvm_host.h | 7 +++---- virt/kvm/eventfd.c | 11 ++++------- 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8f4499c7ffc1..75dc633c48dc 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -124,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -165,10 +169,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) idx = srcu_read_lock(&kvm->irq_srcu); if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); } srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87189a41d904..15c78f320678 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -830,10 +830,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); - -int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, - int irq_source_id, int level, bool line_status); - +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e29fd2640709..46dbc0a7dfc1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -171,7 +171,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } -int __attribute__((weak)) kvm_arch_set_irq( +int __attribute__((weak)) kvm_arch_set_irq_inatomic( struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, int irq_source_id, int level, @@ -201,12 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ - if (irq.type == KVM_IRQ_ROUTING_MSI) - kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false); - else if (kvm_arch_set_irq(&irq, kvm, - KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false) == -EWOULDBLOCK) + if (kvm_arch_set_irq_inatomic(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } -- cgit v1.2.3-70-g09d2 From 7695405698d011c05a95288f1f3c0a3dd252dccc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:22:20 +0100 Subject: KVM: device assignment: remove pointless #ifdefs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbols are always defined. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index d090ecf08809..1c17ee807ef7 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -131,7 +131,6 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +149,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +180,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +382,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +403,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +436,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +445,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +453,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +462,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +479,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +517,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +805,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +884,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +989,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r = -EFAULT; @@ -1033,7 +1009,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; -- cgit v1.2.3-70-g09d2 From 8a22f234a81ab4d1de5d948c3478608f08a9b844 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 18:52:02 +0100 Subject: KVM: x86: move kvm_set_irq_inatomic to legacy device assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function is not used outside device assignment, and kvm_arch_set_irq_inatomic has a different prototype. Move it here and make it static to avoid confusion. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 34 ---------------------------------- include/linux/kvm_host.h | 1 - 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 1c17ee807ef7..9dc091acd5fb 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,6 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 75dc633c48dc..84b96d319909 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -142,40 +142,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15c78f320678..242a6d2b53ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -827,7 +827,6 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, -- cgit v1.2.3-70-g09d2 From 656ec4a4928a3db7d16e5cb9bce351a478cfd3d5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 2 Nov 2015 22:20:00 +0100 Subject: KVM: VMX: fix SMEP and SMAP without EPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in code had it mostly right, but we enable paging for emulated real mode regardless of EPT. Without EPT (which implies emulated real mode), secondary VCPUs won't start unless we disable SM[AE]P when the guest doesn't use paging. Signed-off-by: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d0aa31a42ba..2ac116417583 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3788,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; -- cgit v1.2.3-70-g09d2 From 89651a3decbe03754f304a0b248f27eeb9a37937 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 3 Nov 2015 13:43:05 +0100 Subject: KVM: x86: allow RSM from 64-bit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDM says that exiting system management mode from 64-bit mode is invalid, but that would be too good to be true. But actually, most of the code is already there to support exiting from compat mode (EFER.LME=1, EFER.LMA=0). Getting all the way from 64-bit mode to real mode only requires clearing CS.L and CR4.PCIDE. Cc: stable@vger.kernel.org Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Tested-by: Laszlo Ersek Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b60fed56671b..1505587d06e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2484,16 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4454,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), -- cgit v1.2.3-70-g09d2 From 879ae1880449c88db11c1ebdaedc2da79b2fe73f Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 4 Nov 2015 12:54:41 +0100 Subject: KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() Commit b18d5431acc7 ("KVM: x86: fix CR0.CD virtualization") was technically correct, but it broke OVMF guests by slowing down various parts of the firmware. Commit fb279950ba02 ("KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED") quirked the first function modified by b18d5431acc7, vmx_get_mt_mask(), for OVMF's sake. This restored the speed of the OVMF code that runs before PlatformPei (including the memory intensive LZMA decompression in SEC). This patch extends the quirk to the second function modified by b18d5431acc7, kvm_set_cr0(). It eliminates the intrusive slowdown that hits the EFI_MP_SERVICES_PROTOCOL implementation of edk2's UefiCpuPkg/CpuDxe -- which is built into OVMF --, when CpuDxe starts up all APs at once for initialization, in order to count them. We also carry over the kvm_arch_has_noncoherent_dma() sub-condition from the other half of the original commit b18d5431acc7. Fixes: b18d5431acc7a2fd22767925f3a6f597aa4bd29e Cc: stable@vger.kernel.org Cc: Jordan Justen Cc: Alex Williamson Reviewed-by: Xiao Guangrong Tested-by: Janusz Mocek Signed-off-by: Laszlo Ersek # Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a24bae0a83a2..30723a4122cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -625,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; -- cgit v1.2.3-70-g09d2 From a3eaa8649e4c6a6afdafaa04b9114fb230617bb1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 4 Nov 2015 13:46:05 +0800 Subject: KVM: VMX: Fix commit which broke PML I found PML was broken since below commit: commit feda805fe7c4ed9cf78158e73b1218752e3b4314 Author: Xiao Guangrong Date: Wed Sep 9 14:05:55 2015 +0800 KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini The reason is in above commit vmx_cpuid_update calls vmx_secondary_exec_control, in which currently SECONDARY_EXEC_ENABLE_PML bit is cleared unconditionally (as PML is enabled in creating vcpu). Therefore if vcpu_cpuid_update is called after vcpu is created, PML will be disabled unexpectedly while log-dirty code still thinks PML is used. Fix this by clearing SECONDARY_EXEC_ENABLE_PML in vmx_secondary_exec_control only when PML is not supported or not enabled (!enable_pml). This is more reasonable as PML is currently either always enabled or disabled. With this explicit updating SECONDARY_EXEC_ENABLE_PML in vmx_enable{disable}_pml is not needed so also rename vmx_enable{disable}_pml to vmx_create{destroy}_pml_buffer. Fixes: feda805fe7c4ed9cf78158e73b1218752e3b4314 Signed-off-by: Kai Huang [While at it, change a wrong ASSERT to an "if". The condition can happen if creating the VCPU fails with ENOMEM. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ac116417583..5eb56ed77c1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4718,8 +4718,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* Currently, we allow L1 guest to directly run pcommit instruction. */ exec_control &= ~SECONDARY_EXEC_PCOMMIT; @@ -7804,7 +7805,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; @@ -7817,18 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -8706,7 +8704,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); + vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8790,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; } -- cgit v1.2.3-70-g09d2