From 35e6eaa3df55822d0cb1df3bf08e6cb816737131 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:01 +0100 Subject: KVM: x86: don't allow kernel irqchip with split irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split irqchip cannot be created after creating the kernel irqchip, but we forgot to restrict the other way. This is an API change. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f22810a7e0c..c72a8d00a1c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3961,7 +3961,7 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_lock(&kvm->lock); r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; r = -EINVAL; if (kvm->created_vcpus) -- cgit v1.2.3-70-g09d2 From 49776faf93f8074bb4990beac04781a9507d3650 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:02 +0100 Subject: KVM: x86: decouple irqchip_in_kernel() and pic_irqchip() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit irqchip_in_kernel() tried to save a bit by reusing pic_irqchip(), but it just complicated the code. Add a separate state for the irqchip mode. Reviewed-by: David Hildenbrand [Used Paolo's version of condition in irqchip_in_kernel().] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/irq.h | 15 ++++++++------- arch/x86/kvm/x86.c | 5 +++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..fc03ab1f6110 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -716,6 +716,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +794,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..f4965bc2613c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c72a8d00a1c0..0630ab438bd5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3894,7 +3894,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3988,8 +3988,9 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); -- cgit v1.2.3-70-g09d2 From 099413664c71fcf9d0099eba4f8a4dd59653d5a3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:03 +0100 Subject: KVM: x86: make pic setup code look like ioapic setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't treat kvm->arch.vpic specially anymore, so the setup can look like ioapic. This gets a bit more information out of return values. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 16 +++++++++++----- arch/x86/kvm/irq.h | 4 ++-- arch/x86/kvm/x86.c | 30 +++++++++++++++--------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f4965bc2613c..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0630ab438bd5..05ac71a01f99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3957,33 +3957,34 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; @@ -3991,7 +3992,6 @@ long kvm_arch_vm_ioctl(struct file *filp, /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; - kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; -- cgit v1.2.3-70-g09d2 From e5dc48777dcc898210e2f16d80d44718db38cdc3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:04 +0100 Subject: KVM: x86: refactor pic setup in kvm_set_routing_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..1dfeb185a1e3 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,14 +297,12 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: max_pin = KVM_IOAPIC_NUM_PINS; -- cgit v1.2.3-70-g09d2 From 8231f50d9853274ed104aac86b6b6263ca666c4d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:05 +0100 Subject: KVM: x86: prevent setup of invalid routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check in kvm_set_pic_irq() and kvm_set_ioapic_irq() was just a temporary measure until the code improved enough for us to do this. This changes APIC in a case when KVM_SET_GSI_ROUTING is called to set up pic and ioapic routes before KVM_CREATE_IRQCHIP. Those rules would get overwritten by KVM_CREATE_IRQCHIP at best, so it is pointless to allow it. Userspaces hopefully noticed that things don't work if they do that and don't do that. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 1dfeb185a1e3..2639b8d3dce2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. - */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -301,10 +288,16 @@ int kvm_set_routing_entry(struct kvm *kvm, delta = 8; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; -- cgit v1.2.3-70-g09d2 From 826da32140dada1467f4216410525511393317e8 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:06 +0100 Subject: KVM: x86: simplify conditions with split/kernel irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 2639b8d3dce2..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -400,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05ac71a01f99..a356d8e12c2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4027,7 +4027,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4051,7 +4051,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) -- cgit v1.2.3-70-g09d2 From f3414bc77419463c0d81eaa2cea7ee4ccb447c7d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 20 Dec 2016 15:25:57 -0800 Subject: kvm: x86: export maximum number of mmu_page_hash collisions Report the maximum number of mmu_page_hash collisions as a per-VM stat. This will make it easy to identify problems with the mmu_page_hash in the future. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 25 +++++++++++++++++-------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc03ab1f6110..1bb1ffc0024c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -821,6 +821,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..45ee7ae88239 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1904,17 +1904,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2116,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2131,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2159,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2189,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a356d8e12c2f..4aece8b0a4aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -190,6 +190,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; -- cgit v1.2.3-70-g09d2 From 114df303a7eeae8b50ebf68229b7e647714a9bea Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 19 Dec 2016 13:58:25 -0800 Subject: kvm: x86: reduce collisions in mmu_page_hash When using two-dimensional paging, the mmu_page_hash (which provides lookups for existing kvm_mmu_page structs), becomes imbalanced; with too many collisions in buckets 0 and 512. This has been seen to cause mmu_lock to be held for multiple milliseconds in kvm_mmu_get_page on VMs with a large amount of RAM mapped with 4K pages. The current hash function uses the lower 10 bits of gfn to index into mmu_page_hash. When doing shadow paging, gfn is the address of the guest page table being shadow. These tables are 4K-aligned, which makes the low bits of gfn a good hash. However, with two-dimensional paging, no guest page tables are being shadowed, so gfn is the base address that is mapped by the table. Thus page tables (level=1) have a 2MB aligned gfn, page directories (level=2) have a 1GB aligned gfn, etc. This means hashes will only differ in their 10th bit. hash_64() provides a better hash. For example, on a VM with ~200G (99458 direct=1 kvm_mmu_page structs): hash max_mmu_page_hash_collisions -------------------------------------------- low 10 bits 49847 hash_64 105 perfect 97 While we're changing the hash, increase the table size by 4x to better support large VMs (further reduces number of collisions in 200G VM to 29). Note that hash_64() does not provide a good distribution prior to commit ef703f49a6c5 ("Eliminate bad hash multipliers from hash_32() and hash_64()"). Signed-off-by: David Matlack Change-Id: I5aa6b13c834722813c6cca46b8b1ed6f53368ade Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bb1ffc0024c..7e594a325158 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45ee7ae88239..3f9fa39f1469 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1713,7 +1714,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From 27959a4415a5a00881a7b9353ab9b1274da2ca47 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:10 -0800 Subject: kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications This change adds some symbolic constants for VM Exit Qualifications related to EPT Violations and updates handle_ept_violation() to use these constants instead of hard-coded numbers. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 16 ++++++++++++++++ arch/x86/kvm/vmx.c | 22 ++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..25a482fb5241 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -499,6 +499,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_NMI 3 #define ENTRY_FAIL_VMCS_LINK_PTR 4 +/* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_READ_BIT 0 +#define EPT_VIOLATION_WRITE_BIT 1 +#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) +#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) +#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + /* * VM-instruction error numbers */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..81159a3878f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6374,14 +6374,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; -- cgit v1.2.3-70-g09d2 From ea4114bcd3a8c84f0eb0b52e56d348c27ddede2e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:11 -0800 Subject: kvm: x86: mmu: Rename spte_is_locklessly_modifiable() This change renames spte_is_locklessly_modifiable() to spte_can_locklessly_be_made_writable() to distinguish it from other forms of lockless modifications. The full set of lockless modifications is covered by spte_has_volatile_bits(). Signed-off-by: Junaid Shahid Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3f9fa39f1469..e923f393ac26 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -474,7 +474,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -488,7 +488,7 @@ static bool spte_has_volatile_bits(u64 spte) * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte)) return true; if (!shadow_accessed_mask) @@ -557,7 +557,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1213,7 +1213,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -2975,7 +2975,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Currently, to simplify the code, only the spte write-protected * by dirty-log can be fast fixed. */ - if (!spte_is_locklessly_modifiable(spte)) + if (!spte_can_locklessly_be_made_writable(spte)) goto exit; /* -- cgit v1.2.3-70-g09d2 From 97dceba29a6acbb28d16c8c5757ae9f4e1e482ea Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:12 -0800 Subject: kvm: x86: mmu: Fast Page Fault path retries This change adds retries into the Fast Page Fault path. Without the retries, the code still works, but if a retry does end up being needed, then it will result in a second page fault for the same memory access, which will cause much more overhead compared to just retrying within the original fault. This would be especially useful with the upcoming fast access tracking change, as that would make it more likely for retries to be needed (e.g. due to read and write faults happening on different CPUs at the same time). Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 124 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e923f393ac26..f6d3505c8d18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2891,6 +2891,10 @@ static bool page_fault_can_be_fast(u32 error_code) return true; } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep, u64 spte) @@ -2917,8 +2921,10 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + return false; + + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -2933,8 +2939,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2947,62 +2954,77 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if (!is_shadow_present_pte(spte) || iterator.level < level) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + do { + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_shadow_present_pte(spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) + break; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) - goto exit; + /* + * Currently, to simplify the code, only the spte + * write-protected by dirty-log can be fast fixed. + */ + if (!spte_can_locklessly_be_made_writable(spte)) + break; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte since we only + * dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte() that means other pages are missed + * if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte + * to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + spte = mmu_spte_get_lockless(iterator.sptep); + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. - * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, -- cgit v1.2.3-70-g09d2 From 83ef6c8155c0ecb4c1a7e6bfbe425c85f7cb676d Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:13 -0800 Subject: kvm: x86: mmu: Refactor accessed/dirty checks in mmu_spte_update/clear This simplifies mmu_spte_update() a little bit. The checks for clearing of accessed and dirty bits are refactored into separate functions, which are used inside both mmu_spte_update() and mmu_spte_clear_track_bits(), as well as kvm_test_age_rmapp(). The new helper functions handle both the case when A/D bits are supported in hardware and the case when they are not. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 66 +++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f6d3505c8d18..cfef95969335 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -504,14 +504,16 @@ static bool spte_has_volatile_bits(u64 spte) return true; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : true; } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -534,17 +536,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) * will find a read-only spte, even though the writable spte * might be cached on a CPU's TLB, the return value indicates this * case. + * + * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; + bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return flush; } if (!spte_has_volatile_bits(old_spte)) @@ -552,6 +556,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in @@ -559,38 +565,31 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -614,11 +613,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -1616,7 +1616,6 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* * If there's no access bit in the secondary pte set by the @@ -1626,14 +1625,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 -- cgit v1.2.3-70-g09d2 From f39a058d0ea2f58b9c69cfcf7c93184f33302c98 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:14 -0800 Subject: kvm: x86: mmu: Introduce a no-tracking version of mmu_spte_update mmu_spte_update() tracks changes in the accessed/dirty state of the SPTE being updated and calls kvm_set_pfn_accessed/dirty appropriately. However, in some cases (e.g. when aging the SPTE), this shouldn't be done. mmu_spte_update_no_track() is introduced for use in such cases. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cfef95969335..b8b5259c8ebb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -528,27 +528,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. - * - * Returns true if the TLB needs to be flushed +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return flush; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -558,6 +550,28 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in -- cgit v1.2.3-70-g09d2 From 37f0e8fe6b10ee2ab52576caa721ee1282de74a6 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:15 -0800 Subject: kvm: x86: mmu: Do not use bit 63 for tracking special SPTEs MMIO SPTEs currently set both bits 62 and 63 to distinguish them as special PTEs. However, bit 63 is used as the SVE bit in Intel EPT PTEs. The SVE bit is ignored for misconfigured PTEs but not necessarily for not-Present PTEs. Since MMIO SPTEs use an EPT misconfiguration, so using bit 63 for them is acceptable. However, the upcoming fast access tracking feature adds another type of special tracking PTE, which uses not-Present PTEs and hence should not set bit 63. In order to use common bits to distinguish both type of special PTEs, we now use only bit 62 as the special bit. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/include/asm/vmx.h | 9 +++++++-- arch/x86/kvm/vmx.c | 6 +++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e594a325158..3272a5e4aaad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. + */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 25a482fb5241..fc061cbb46e0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,13 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) + #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81159a3878f4..6f53dedd9b96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. + * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 -- cgit v1.2.3-70-g09d2 From f160c7b7bb322bf079a5bb4dd34c58f17553f193 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:16 -0800 Subject: kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits. This change implements lockless access tracking for Intel CPUs without EPT A bits. This is achieved by marking the PTEs as not-present (but not completely clearing them) when clear_flush_young() is called after marking the pages as accessed. When an EPT Violation is generated as a result of the VM accessing those pages, the PTEs are restored to their original values. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/include/asm/vmx.h | 9 +- arch/x86/kvm/mmu.c | 279 ++++++++++++++++++++++++++++++---------- arch/x86/kvm/vmx.c | 26 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 239 insertions(+), 80 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3272a5e4aaad..99a71d90b6ae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1064,7 +1064,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fc061cbb46e0..a22a4790f1ac 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -469,11 +469,14 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ -#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ - VMX_EPT_EXECUTABLE_MASK) - +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8b5259c8ebb..64821ca3a7c3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include #define CREATE_TRACE_POINTS @@ -179,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -188,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -285,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -293,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -308,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -482,32 +527,32 @@ static bool spte_can_locklessly_be_made_writable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_can_locklessly_be_made_writable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } static bool is_accessed_spte(u64 spte) { return shadow_accessed_mask ? spte & shadow_accessed_mask - : true; + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) @@ -651,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Verify that the write-protection that we do below will be fixable + * via the fast page fault path. Currently, that is always the case, at + * least when using EPT (which is when access tracking would be used). + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1435,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1448,7 +1548,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1610,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1632,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; for_each_rmap_spte(rmap_head, &iter, sptep) @@ -1671,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -2603,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2656,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2889,16 +2986,28 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } /* @@ -2907,17 +3016,26 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, + bool remove_write_prot, bool remove_acc_track) { gfn_t gfn; + u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + if (remove_acc_track) { + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + } + + if (remove_write_prot) + new_spte |= PT_WRITABLE_MASK; /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -2931,10 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (remove_write_prot) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } @@ -2965,35 +3090,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; do { - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - fault_handled = true; - break; - } + bool remove_write_prot = false; + bool remove_acc_track; sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; /* - * Check if it is a spurious fault caused by TLB lazily flushed. + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_writable_pte(spte)) { - fault_handled = true; - break; + + if (error_code & PFERR_FETCH_MASK) { + if ((spte & (shadow_x_mask | shadow_nx_mask)) + == shadow_x_mask) { + fault_handled = true; + break; + } + } else if (error_code & PFERR_WRITE_MASK) { + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging. + */ + remove_write_prot = + spte_can_locklessly_be_made_writable(spte); + } else { + /* Fault was on Read access */ + if (spte & PT_PRESENT_MASK) { + fault_handled = true; + break; + } } - /* - * Currently, to simplify the code, only the spte - * write-protected by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) + remove_acc_track = is_access_track_spte(spte); + + /* Verify that the fault can be handled in the fast path */ + if (!remove_acc_track && !remove_write_prot) break; /* @@ -3007,7 +3152,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) break; /* @@ -3016,7 +3161,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Documentation/virtual/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte); + iterator.sptep, spte, + remove_write_prot, + remove_acc_track); if (fault_handled) break; @@ -5105,6 +5252,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f53dedd9b96..d2fe3a51876c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6578,6 +6578,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6703,16 +6716,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aece8b0a4aa..c3ee5e29ea2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6025,7 +6025,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); -- cgit v1.2.3-70-g09d2 From 63dbe14d39b0505e3260bed92e5f4905f49c09d9 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:17 -0800 Subject: kvm: x86: mmu: Update documentation for fast page fault mechanism Add a brief description of the lockless access tracking mechanism to the documentation of fast page faults in locking.txt. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index fd013bf4115b..1bb8bcaf8497 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -26,9 +26,16 @@ sections. Fast page fault: Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast only if the -shadow page table is present and it is caused by write-protect, that means -we just need change the W bit of the spte. +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This is safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + 3. Reference ------------ -- cgit v1.2.3-70-g09d2 From f98a3efb284a7950745d6c95be489193e6d4c657 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:45 +0100 Subject: KVM: x86: use delivery to self in hyperv synic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interrupt to self can be sent without knowing the APIC ID. Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } -- cgit v1.2.3-70-g09d2 From 6e50043912d9c9c119e3c9c5378869d019df70a9 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:46 +0100 Subject: KVM: x86: replace kvm_apic_id with kvm_{x,x2}apic_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were three calls sites: - recalculate_apic_map and kvm_apic_match_physical_addr, where it would only complicate implementation of x2APIC hotplug; - in apic_debug, where it was still somewhat preserved, but keeping the old function just for apic_debug was not worth it Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 31 ++++++++++++++++++++++--------- arch/x86/kvm/lapic.h | 11 ----------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fe290c1b7d8..7c142f0fe9fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -184,12 +194,13 @@ static void recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); - + aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) + : kvm_xapic_id(apic); if (aid <= new->max_apic_id) new->phys_map[aid] = apic; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; } else if (ldr) { @@ -250,6 +261,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -591,9 +604,9 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -1907,9 +1920,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e0c80233b3e1..cb16e6fd2330 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -202,17 +202,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From b4535b58ae0df8b7cf0fe92a0c23aa3cf862e3cc Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:47 +0100 Subject: KVM: x86: make interrupt delivery fast and slow path behave the same MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slow path tried to prevent IPIs from x2APIC VCPUs from being delivered to xAPIC VCPUs and vice-versa. Make slow path behave like fast path, which never distinguished that. Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7c142f0fe9fd..3ebef53d20a0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -592,10 +592,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -606,7 +604,7 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -623,7 +621,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -640,9 +637,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -658,13 +655,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, -- cgit v1.2.3-70-g09d2 From 5bd5db385b3e13c702365574c0b7350c6ea45e84 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:48 +0100 Subject: KVM: x86: allow hotplug of VCPU with APIC ID over 0xff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI, but there is no way to uniquely address it using xAPIC. From many possible options, we chose the one that also works on real hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in xAPIC mode. KVM intentionally differs from real hardware, because real hardware (Knights Landing) does just "x2apic_id & 0xff" to decide whether to accept the interrupt in xAPIC mode and it can deliver one interrupt to more than one physical destination, e.g. 0x123 to 0x123 and 0x23. Fixes: 682f732ecf73 ("KVM: x86: bump MAX_VCPUS to 288") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ebef53d20a0..7e9ac4606279 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -189,15 +189,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) - : kvm_xapic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); + + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; ldr = kvm_lapic_get_reg(apic, APIC_LDR); @@ -604,6 +615,15 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; + return mda == kvm_xapic_id(apic); } -- cgit v1.2.3-70-g09d2 From 0f89b207b04a1a399e19d35293658e3a571da3d7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 14 Dec 2016 14:59:23 -0500 Subject: kvm: svm: Use the hardware provided GPA instead of page walk When a guest causes a NPF which requires emulation, KVM sometimes walks the guest page tables to translate the GVA to a GPA. This is unnecessary most of the time on AMD hardware since the hardware provides the GPA in EXITINFO2. The only exception cases involve string operations involving rep or operations that use two memory locations. With rep, the GPA will only be the value of the initial NPF and with dual memory locations we won't know which memory address was translated into EXITINFO2. Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/emulate.c | 20 +++++++++++++---- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++--------- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 99a71d90b6ae..0419e114f27b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -675,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56628a44668b..2b8349a2b14b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4256,7 +4257,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4294,7 +4295,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4514,8 +4515,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5629,3 +5630,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3ee5e29ea2a..edff19d1df97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4462,6 +4462,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4488,16 +4503,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4594,6 +4600,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5610,6 +5632,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) -- cgit v1.2.3-70-g09d2 From 0f1e261ead16ce09169bf2d223d4c8803576f85e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 17 Dec 2016 16:05:19 +0100 Subject: KVM: x86: add VCPU stat for KVM_REQ_EVENT processing This statistic can be useful to estimate the cost of an IRQ injection scenario, by comparing it with irq_injections. For example the stat shows that sti;hlt triggers more KVM_REQ_EVENT than sti;nop. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0419e114f27b..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -861,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index edff19d1df97..b02af6285887 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -6756,6 +6757,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; -- cgit v1.2.3-70-g09d2 From eb90f3417a0cc4880e979ccc84e41890d410ea5b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 14:02:21 +0100 Subject: KVM: vmx: speed up TPR below threshold vmexits Since we're already in VCPU context, all we have to do here is recompute the PPR value. That will in turn generate a KVM_REQ_EVENT if necessary. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7e9ac4606279..6b1d3a76c1d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -595,6 +595,12 @@ static void apic_update_ppr(struct kvm_lapic *apic) } } +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); + static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index cb16e6fd2330..5b5b1ba644cb 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2fe3a51876c..94fda2010f5f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6152,7 +6152,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } -- cgit v1.2.3-70-g09d2 From b3c045d33218fe291b04d30e24b6eab0431987e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:47:54 +0100 Subject: KVM: lapic: remove unnecessary KVM_REQ_EVENT on PPR update PPR needs to be updated whenever on every IRR read because we may have missed TPR writes that _increased_ PPR. However, these writes need not generate KVM_REQ_EVENT, because either KVM_REQ_EVENT has been set already in __apic_accept_irq, or we are going to process the interrupt right away. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6b1d3a76c1d0..a878e33119a3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -570,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -588,11 +596,19 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr)) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) @@ -2056,17 +2072,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 26fbbee5815e9352187ac18f0aa53534f62567ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 13:54:58 +0100 Subject: KVM: lapic: do not set KVM_REQ_EVENT unnecessarily on PPR update On PPR update, we set KVM_REQ_EVENT unconditionally anytime PPR is lowered. But we can take into account IRR here already. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a878e33119a3..457fb206647d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -607,7 +607,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) { u32 ppr; - if (__apic_update_ppr(apic, &ppr)) + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -- cgit v1.2.3-70-g09d2 From 4d82d12b39132e820b9ac4aa058ccc733db98917 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:43:41 +0100 Subject: KVM: lapic: do not scan IRR when delivering an interrupt On interrupt delivery the PPR can only grow (except for auto-EOI), so it is impossible that non-auto-EOI interrupt delivery results in KVM_REQ_EVENT. We can therefore use __apic_update_ppr. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 457fb206647d..10a745faa659 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2115,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2126,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; -- cgit v1.2.3-70-g09d2 From 21e7fbe7db2a983c046a05f12419d88c554a0f5a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 22 Dec 2016 15:49:55 -0800 Subject: kvm: nVMX: Reorder error checks for emulated VMXON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checks on the operand to VMXON are performed after the check for legacy mode operation and the #GP checks, according to the pseudo-code in Intel's SDM. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94fda2010f5f..4e691035a32d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7180,9 +7180,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7194,6 +7191,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); -- cgit v1.2.3-70-g09d2 From a17f32270af1e1054bbc8858b0f27226a2c859ba Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Tue, 10 Jan 2017 18:34:03 +0100 Subject: kvm: x86: Expose Intel VPOPCNTDQ feature to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vector population count instructions for dwords and qwords are to be used in future Intel Xeon & Xeon Phi processors. The bit 14 of CPUID[level:0x07, ECX] indicates that the new instructions are supported by a processor. The spec can be found in the Intel Software Developer Manual (SDM) or in the Instruction Set Extensions Programming Reference (ISE). Signed-off-by: Piotr Luc Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..09c2ac741567 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit v1.2.3-70-g09d2 From 0b4c208d443ba2af82b4c70f99ca8df31e9a0020 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 20 Dec 2016 16:34:50 -0800 Subject: Revert "KVM: nested VMX: disable perf cpuid reporting" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bc6134942dbbf31c25e9bd7c876be5da81c9e1ce. A CPUID instruction executed in VMX non-root mode always causes a VM-exit, regardless of the leaf being queried. Fixes: bc6134942dbb ("KVM: nested VMX: disable perf cpuid reporting") Signed-off-by: Jim Mattson [The issue solved by bc6134942dbb has been resolved with ff651cb613b4 ("KVM: nVMX: Add nested msr load/restore algorithm").] Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 6 ------ arch/x86/kvm/vmx.c | 2 -- 2 files changed, 8 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 09c2ac741567..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e691035a32d..c7bafa1457e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8203,8 +8203,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); -- cgit v1.2.3-70-g09d2 From 8694e4da66a636665f51b94a6a7a40c9fc0dc5ec Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Jan 2017 14:07:18 +0100 Subject: KVM: arm/arm64: Remove struct vgic_irq pending field One of the goals behind the VGIC redesign was to get rid of cached or intermediate state in the data structures, but we decided to allow ourselves to precompute the pending value of an IRQ based on the line level and pending latch state. However, this has now become difficult to base proper GICv3 save/restore on, because there is a potential to modify the pending state without knowing if an interrupt is edge or level configured. See the following post and related message for more background: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023195.html This commit gets rid of the precomputed pending field in favor of a function that calculates the value when needed, irq_is_pending(). The soft_pending field is renamed to pending_latch to represent that this latch is the equivalent hardware latch which gets manipulated by the input signal for edge-triggered interrupts and when writing to the SPENDR/CPENDR registers. After this commit save/restore code should be able to simply restore the pending_latch state, line_level state, and config state in any order and get the desired result. Reviewed-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Andre Przywara Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 5 +++-- virt/kvm/arm/vgic/vgic-its.c | 6 +++--- virt/kvm/arm/vgic/vgic-mmio-v2.c | 6 +++--- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic-mmio.c | 19 +++++-------------- virt/kvm/arm/vgic/vgic-v2.c | 12 +++++------- virt/kvm/arm/vgic/vgic-v3.c | 12 +++++------- virt/kvm/arm/vgic/vgic.c | 16 +++++++--------- virt/kvm/arm/vgic/vgic.h | 8 ++++++++ 9 files changed, 40 insertions(+), 46 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 002f0922cd92..da2ce086ce31 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -101,9 +101,10 @@ struct vgic_irq { */ u32 intid; /* Guest visible INTID */ - bool pending; bool line_level; /* Level only */ - bool soft_pending; /* Level only */ + bool pending_latch; /* The pending latch state used to calculate + * the pending state for both level + * and edge triggered IRQs. */ bool active; /* not used for LPIs */ bool enabled; bool hw; /* Tied to HW IRQ */ diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 8c2b3cdcb2c5..571b64a01c50 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -350,7 +350,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); spin_lock(&irq->irq_lock); - irq->pending = pendmask & (1U << bit_nr); + irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); } @@ -465,7 +465,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, return -EBUSY; spin_lock(&itte->irq->irq_lock); - itte->irq->pending = true; + itte->irq->pending_latch = true; vgic_queue_irq_unlock(kvm, itte->irq); return 0; @@ -913,7 +913,7 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, if (!itte) return E_ITS_CLEAR_UNMAPPED_INTERRUPT; - itte->irq->pending = false; + itte->irq->pending_latch = false; return 0; } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 78e34bc4d89b..07e67f1e25ef 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -98,7 +98,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); spin_lock(&irq->irq_lock); - irq->pending = true; + irq->pending_latch = true; irq->source |= 1U << source_vcpu->vcpu_id; vgic_queue_irq_unlock(source_vcpu->kvm, irq); @@ -182,7 +182,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, irq->source &= ~((val >> (i * 8)) & 0xff); if (!irq->source) - irq->pending = false; + irq->pending_latch = false; spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); @@ -204,7 +204,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, irq->source |= (val >> (i * 8)) & 0xff; if (irq->source) { - irq->pending = true; + irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq); } else { spin_unlock(&irq->irq_lock); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 50f42f0f8c4f..2aca52a6c9eb 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -646,7 +646,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); spin_lock(&irq->irq_lock); - irq->pending = true; + irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index ebe1b9fa3c4d..2670d39d1f86 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -111,7 +111,7 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - if (irq->pending) + if (irq_is_pending(irq)) value |= (1U << i); vgic_put_irq(vcpu->kvm, irq); @@ -131,9 +131,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); spin_lock(&irq->irq_lock); - irq->pending = true; - if (irq->config == VGIC_CONFIG_LEVEL) - irq->soft_pending = true; + irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); @@ -152,12 +150,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, spin_lock(&irq->irq_lock); - if (irq->config == VGIC_CONFIG_LEVEL) { - irq->soft_pending = false; - irq->pending = irq->line_level; - } else { - irq->pending = false; - } + irq->pending_latch = false; spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); @@ -359,12 +352,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); spin_lock(&irq->irq_lock); - if (test_bit(i * 2 + 1, &val)) { + if (test_bit(i * 2 + 1, &val)) irq->config = VGIC_CONFIG_EDGE; - } else { + else irq->config = VGIC_CONFIG_LEVEL; - irq->pending = irq->line_level | irq->soft_pending; - } spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 834137e7b83f..b834ecdf3225 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -104,7 +104,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && (val & GICH_LR_PENDING_BIT)) { - irq->pending = true; + irq->pending_latch = true; if (vgic_irq_is_sgi(intid)) { u32 cpuid = val & GICH_LR_PHYSID_CPUID; @@ -120,9 +120,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) */ if (irq->config == VGIC_CONFIG_LEVEL) { if (!(val & GICH_LR_PENDING_BIT)) - irq->soft_pending = false; - - irq->pending = irq->line_level || irq->soft_pending; + irq->pending_latch = false; } spin_unlock(&irq->irq_lock); @@ -145,11 +143,11 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { u32 val = irq->intid; - if (irq->pending) { + if (irq_is_pending(irq)) { val |= GICH_LR_PENDING_BIT; if (irq->config == VGIC_CONFIG_EDGE) - irq->pending = false; + irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid)) { u32 src = ffs(irq->source); @@ -158,7 +156,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; irq->source &= ~(1 << (src - 1)); if (irq->source) - irq->pending = true; + irq->pending_latch = true; } } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index e6b03fd8c374..679ba935698f 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -94,7 +94,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && (val & ICH_LR_PENDING_BIT)) { - irq->pending = true; + irq->pending_latch = true; if (vgic_irq_is_sgi(intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2) { @@ -111,9 +111,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) */ if (irq->config == VGIC_CONFIG_LEVEL) { if (!(val & ICH_LR_PENDING_BIT)) - irq->soft_pending = false; - - irq->pending = irq->line_level || irq->soft_pending; + irq->pending_latch = false; } spin_unlock(&irq->irq_lock); @@ -127,11 +125,11 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; - if (irq->pending) { + if (irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; if (irq->config == VGIC_CONFIG_EDGE) - irq->pending = false; + irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2) { @@ -141,7 +139,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; irq->source &= ~(1 << (src - 1)); if (irq->source) - irq->pending = true; + irq->pending_latch = true; } } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 6440b56ec90e..dea12df2b879 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -160,7 +160,7 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) * If the distributor is disabled, pending interrupts shouldn't be * forwarded. */ - if (irq->enabled && irq->pending) { + if (irq->enabled && irq_is_pending(irq)) { if (unlikely(irq->target_vcpu && !irq->target_vcpu->kvm->arch.vgic.enabled)) return NULL; @@ -204,8 +204,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) goto out; } - penda = irqa->enabled && irqa->pending; - pendb = irqb->enabled && irqb->pending; + penda = irqa->enabled && irq_is_pending(irqa); + pendb = irqb->enabled && irq_is_pending(irqb); if (!penda || !pendb) { ret = (int)pendb - (int)penda; @@ -371,12 +371,10 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, return 0; } - if (irq->config == VGIC_CONFIG_LEVEL) { + if (irq->config == VGIC_CONFIG_LEVEL) irq->line_level = level; - irq->pending = level || irq->soft_pending; - } else { - irq->pending = true; - } + else + irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq); vgic_put_irq(kvm, irq); @@ -689,7 +687,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - pending = irq->pending && irq->enabled; + pending = irq_is_pending(irq) && irq->enabled; spin_unlock(&irq->irq_lock); if (pending) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 859f65c6e056..b2cf34bde243 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -30,6 +30,14 @@ #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) +static inline bool irq_is_pending(struct vgic_irq *irq) +{ + if (irq->config == VGIC_CONFIG_EDGE) + return irq->pending_latch; + else + return irq->pending_latch || irq->line_level; +} + struct vgic_vmcr { u32 ctlr; u32 abpr; -- cgit v1.2.3-70-g09d2 From 10f92c4c537794f4e2b5f545a8953790c5445d0f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 17 Jan 2017 23:09:13 +0100 Subject: KVM: arm/arm64: vgic: Add debugfs vgic-state file Add a file to debugfs to read the in-kernel state of the vgic. We don't do any locking of the entire VGIC state while traversing all the IRQs, so if the VM is running the user/developer may not see a quiesced state, but should take care to pause the VM using facilities in user space for that purpose. We also don't support LPIs yet, but they can be added easily if needed. Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Andre Przywara Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/Makefile | 1 + arch/arm64/kvm/Makefile | 1 + include/kvm/arm_vgic.h | 5 + virt/kvm/arm/vgic/vgic-debug.c | 283 +++++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-init.c | 4 + virt/kvm/arm/vgic/vgic.h | 3 + 6 files changed, 297 insertions(+) create mode 100644 virt/kvm/arm/vgic/vgic-debug.c diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index d571243ab4d1..12b628187e90 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -33,5 +33,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o obj-y += $(KVM)/arm/vgic/vgic-its.o +obj-y += $(KVM)/arm/vgic/vgic-debug.o obj-y += $(KVM)/irqchip.o obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index d50a82a16ff6..e025beceda39 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -31,6 +31,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index da2ce086ce31..0af1477cfe8d 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -166,6 +166,8 @@ struct vgic_its { struct list_head collection_list; }; +struct vgic_state_iter; + struct vgic_dist { bool in_kernel; bool ready; @@ -213,6 +215,9 @@ struct vgic_dist { spinlock_t lpi_list_lock; struct list_head lpi_list_head; int lpi_list_count; + + /* used by vgic-debug */ + struct vgic_state_iter *iter; }; struct vgic_v2_cpu_if { diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c new file mode 100644 index 000000000000..7072ab743332 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-debug.c @@ -0,0 +1,283 @@ +/* + * Copyright (C) 2016 Linaro + * Author: Christoffer Dall + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include "vgic.h" + +/* + * Structure to control looping through the entire vgic state. We start at + * zero for each field and move upwards. So, if dist_id is 0 we print the + * distributor info. When dist_id is 1, we have already printed it and move + * on. + * + * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and + * so on. + */ +struct vgic_state_iter { + int nr_cpus; + int nr_spis; + int dist_id; + int vcpu_id; + int intid; +}; + +static void iter_next(struct vgic_state_iter *iter) +{ + if (iter->dist_id == 0) { + iter->dist_id++; + return; + } + + iter->intid++; + if (iter->intid == VGIC_NR_PRIVATE_IRQS && + ++iter->vcpu_id < iter->nr_cpus) + iter->intid = 0; +} + +static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, + loff_t pos) +{ + int nr_cpus = atomic_read(&kvm->online_vcpus); + + memset(iter, 0, sizeof(*iter)); + + iter->nr_cpus = nr_cpus; + iter->nr_spis = kvm->arch.vgic.nr_spis; + + /* Fast forward to the right position if needed */ + while (pos--) + iter_next(iter); +} + +static bool end_of_vgic(struct vgic_state_iter *iter) +{ + return iter->dist_id > 0 && + iter->vcpu_id == iter->nr_cpus && + (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis; +} + +static void *vgic_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + if (iter) { + iter = ERR_PTR(-EBUSY); + goto out; + } + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + iter = ERR_PTR(-ENOMEM); + goto out; + } + + iter_init(kvm, iter, *pos); + kvm->arch.vgic.iter = iter; + + if (end_of_vgic(iter)) + iter = NULL; +out: + mutex_unlock(&kvm->lock); + return iter; +} + +static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = kvm->arch.vgic.iter; + + ++*pos; + iter_next(iter); + if (end_of_vgic(iter)) + iter = NULL; + return iter; +} + +static void vgic_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter; + + /* + * If the seq file wasn't properly opened, there's nothing to clearn + * up. + */ + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->lock); + iter = kvm->arch.vgic.iter; + kfree(iter); + kvm->arch.vgic.iter = NULL; + mutex_unlock(&kvm->lock); +} + +static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) +{ + seq_printf(s, "Distributor\n"); + seq_printf(s, "===========\n"); + seq_printf(s, "vgic_model:\t%s\n", + (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) ? + "GICv3" : "GICv2"); + seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); + seq_printf(s, "enabled:\t%d\n", dist->enabled); + seq_printf(s, "\n"); + + seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); + seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); +} + +static void print_header(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + int id = 0; + char *hdr = "SPI "; + + if (vcpu) { + hdr = "VCPU"; + id = vcpu->vcpu_id; + } + + seq_printf(s, "\n"); + seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHC HWID TARGET SRC PRI VCPU_ID\n", hdr, id); + seq_printf(s, "---------------------------------------------------------------\n"); +} + +static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, + struct kvm_vcpu *vcpu) +{ + char *type; + if (irq->intid < VGIC_NR_SGIS) + type = "SGI"; + else if (irq->intid < VGIC_NR_PRIVATE_IRQS) + type = "PPI"; + else + type = "SPI"; + + if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) + print_header(s, irq, vcpu); + + seq_printf(s, " %s %4d " + " %2d " + "%d%d%d%d%d%d " + "%8d " + "%8x " + " %2x " + "%3d " + " %2d " + "\n", + type, irq->intid, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, + irq->pending_latch, + irq->line_level, + irq->active, + irq->enabled, + irq->hw, + irq->config == VGIC_CONFIG_LEVEL, + irq->hwintid, + irq->mpidr, + irq->source, + irq->priority, + (irq->vcpu) ? irq->vcpu->vcpu_id : -1); + +} + +static int vgic_debug_show(struct seq_file *s, void *v) +{ + struct kvm *kvm = (struct kvm *)s->private; + struct vgic_state_iter *iter = (struct vgic_state_iter *)v; + struct vgic_irq *irq; + struct kvm_vcpu *vcpu = NULL; + + if (iter->dist_id == 0) { + print_dist_state(s, &kvm->arch.vgic); + return 0; + } + + if (!kvm->arch.vgic.initialized) + return 0; + + if (iter->vcpu_id < iter->nr_cpus) { + vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); + irq = &vcpu->arch.vgic_cpu.private_irqs[iter->intid]; + } else { + irq = &kvm->arch.vgic.spis[iter->intid - VGIC_NR_PRIVATE_IRQS]; + } + + spin_lock(&irq->irq_lock); + print_irq_state(s, irq, vcpu); + spin_unlock(&irq->irq_lock); + + return 0; +} + +static struct seq_operations vgic_debug_seq_ops = { + .start = vgic_debug_start, + .next = vgic_debug_next, + .stop = vgic_debug_stop, + .show = vgic_debug_show +}; + +static int debug_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &vgic_debug_seq_ops); + if (!ret) { + struct seq_file *seq; + /* seq_open will have modified file->private_data */ + seq = file->private_data; + seq->private = inode->i_private; + } + + return ret; +}; + +static struct file_operations vgic_debug_fops = { + .owner = THIS_MODULE, + .open = debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +int vgic_debug_init(struct kvm *kvm) +{ + if (!kvm->debugfs_dentry) + return -ENOENT; + + if (!debugfs_create_file("vgic-state", 0444, + kvm->debugfs_dentry, + kvm, + &vgic_debug_fops)) + return -ENOMEM; + + return 0; +} + +int vgic_debug_destroy(struct kvm *kvm) +{ + return 0; +} diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index c737ea0a310a..276139a24e6f 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -259,6 +259,8 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; + vgic_debug_init(kvm); + dist->initialized = true; out: return ret; @@ -288,6 +290,8 @@ static void __kvm_vgic_destroy(struct kvm *kvm) struct kvm_vcpu *vcpu; int i; + vgic_debug_destroy(kvm); + kvm_vgic_dist_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index b2cf34bde243..48da1f65a6c3 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -102,4 +102,7 @@ int kvm_register_vgic_device(unsigned long type); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); +int vgic_debug_init(struct kvm *kvm); +int vgic_debug_destroy(struct kvm *kvm); + #endif -- cgit v1.2.3-70-g09d2 From 3deda5e50c893be38c1b6b3a73f8f8fb5560baa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 20 Dec 2016 14:02:29 +1100 Subject: KVM: PPC: Book3S HV: Don't try to signal cpu -1 If the target vcpu for kvmppc_fast_vcpu_kick_hv() is not running on any CPU, then we will have vcpu->arch.thread_cpu == -1, and as it happens, kvmppc_fast_vcpu_kick_hv will call kvmppc_ipi_thread with -1 as the cpu argument. Although this is not meaningful, in the past, before commit 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9", 2016-11-18), it was harmless because CPU -1 is not in the same core as any real CPU thread. On a POWER9, however, we don't do the "same core" check, so we were trying to do a msgsnd to thread -1, which is invalid. To avoid this, we add a check to see that vcpu->arch.thread_cpu is >= 0 before calling kvmppc_ipi_thread() with it. Since vcpu->arch.thread_vcpu can change asynchronously, we use READ_ONCE to ensure that the value we check is the same value that we use as the argument to kvmppc_ipi_thread(). Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..8d9cc07b1e9c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) + cpu = READ_ONCE(vcpu->arch.thread_cpu); + if (cpu >= 0 && kvmppc_ipi_thread(cpu)) return; /* CPU points to the first thread of the core */ -- cgit v1.2.3-70-g09d2 From 5efa6605151b84029edeb2e07f2d2d74b52c106f Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:32 +0800 Subject: KVM: PPC: Book 3S: XICS cleanup: remove XICS_RM_REJECT Commit b0221556dbd3 ("KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode") removed the setting of the XICS_RM_REJECT flag. And since that commit, nothing else sets the flag any more, so we can remove the flag and the remaining code that handles it, including the counter that counts how many times it get set. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xics.c | 12 +++--------- arch/powerpc/kvm/book3s_xics.h | 2 -- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 20dff102a06f..debda779a240 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -832,10 +832,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); } - if (icp->rm_action & XICS_RM_REJECT) { - icp->n_rm_reject++; - icp_deliver_irq(xics, icp, icp->rm_reject); - } if (icp->rm_action & XICS_RM_NOTIFY_EOI) { icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); @@ -920,7 +916,7 @@ static int xics_debug_show(struct seq_file *m, void *private) int icsid, i; unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; - unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; if (!kvm) @@ -929,7 +925,6 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu = 0; t_rm_notify_eoi = 0; t_rm_check_resend = 0; - t_rm_reject = 0; t_check_resend = 0; t_reject = 0; @@ -952,14 +947,13 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu += icp->n_rm_kick_vcpu; t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; - t_rm_reject += icp->n_rm_reject; t_check_resend += icp->n_check_resend; t_reject += icp->n_reject; } - seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, - t_rm_reject, t_rm_notify_eoi); + t_rm_notify_eoi); seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 2a50320b55ca..1d5fac80b706 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -73,7 +73,6 @@ struct kvmppc_icp { */ #define XICS_RM_KICK_VCPU 0x1 #define XICS_RM_CHECK_RESEND 0x2 -#define XICS_RM_REJECT 0x4 #define XICS_RM_NOTIFY_EOI 0x8 u32 rm_action; struct kvm_vcpu *rm_kick_target; @@ -84,7 +83,6 @@ struct kvmppc_icp { /* Counters for each reason we exited real mode */ unsigned long n_rm_kick_vcpu; unsigned long n_rm_check_resend; - unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; /* Counters for handling ICP processing in real mode */ unsigned long n_check_resend; -- cgit v1.2.3-70-g09d2 From 37451bc95dee0e666927d6ffdda302dbbaaae6fa Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:33 +0800 Subject: KVM: PPC: Book 3S: XICS: correct the real mode ICP rejecting counter Some counters are added in Commit 6e0365b78273 ("KVM: PPC: Book3S HV: Add ICP real mode counters"), to provide some performance statistics to determine whether further optimizing is needed for real mode functions. The n_reject counter counts how many times ICP rejects an irq because of priority in real mode. The redelivery of an lsi that is still asserted after eoi doesn't fall into this category, so the increasement there is removed. Also, it needs to be increased in icp_rm_deliver_irq() if it rejects another one. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..9f6c8fe88db6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -380,6 +380,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ if (reject && reject != XICS_IPI) { arch_spin_unlock(&ics->lock); + icp->n_reject++; new_irq = reject; goto again; } @@ -711,10 +712,8 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state = &ics->irq_state[src]; /* Still asserted, resend it */ - if (state->asserted) { - icp->n_reject++; + if (state->asserted) icp_rm_deliver_irq(xics, icp, irq); - } if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; -- cgit v1.2.3-70-g09d2 From bf5a71d53835110d46d33eb5335713ffdbff9ab6 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:34 +0800 Subject: KVM: PPC: Book 3S: XICS: Fix potential issue with duplicate IRQ resends It is possible that in the following order, one irq is resent twice: CPU 1 CPU 2 ics_check_resend() lock ics_lock see resend set unlock ics_lock /* change affinity of the irq */ kvmppc_xics_set_xive() write_xive() lock ics_lock see resend set unlock ics_lock icp_deliver_irq() /* resend */ icp_deliver_irq() /* resend again */ It doesn't have any user-visible effect at present, but needs to be avoided when the following patch implementing the P/Q stuff is applied. This patch clears the resend flag before releasing the ics lock, when we know we will do a re-delivery after checking the flag, or setting the flag. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 3 +++ arch/powerpc/kvm/book3s_xics.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 9f6c8fe88db6..16349c9e86ba 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -52,6 +52,8 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, if (!state->resend) continue; + state->resend = 0; + arch_spin_unlock(&ics->lock); icp_rm_deliver_irq(xics, icp, state->number); arch_spin_lock(&ics->lock); @@ -400,6 +402,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); goto again; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index debda779a240..cdfb4ed73601 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -125,6 +125,8 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, if (!state->resend) continue; + state->resend = 0; + XICS_DBG("resend %#x prio %#x\n", state->number, state->priority); @@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = false; if ((state->masked_pending || state->resend) && priority != MASKED) { state->masked_pending = 0; + state->resend = 0; deliver = true; } @@ -488,6 +491,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); goto again; -- cgit v1.2.3-70-g09d2 From 17d48610ae0fa218aa386b16a538c792991a3652 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:35 +0800 Subject: KVM: PPC: Book 3S: XICS: Implement ICS P/Q states This patch implements P(Presented)/Q(Queued) states for ICS irqs. When the interrupt is presented, set P. Present if P was not set. If P is already set, don't present again, set Q. When the interrupt is EOI'ed, move Q into P (and clear Q). If it is set, re-present. The asserted flag used by LSI is also incorporated into the P bit. When the irq state is saved, P/Q bits are also saved, they need some qemu modifications to be recognized and passed around to be restored. KVM_XICS_PENDING bit set and saved should also indicate KVM_XICS_PRESENTED bit set and saved. But it is possible some old code doesn't have/recognize the P bit, so when we restore, we set P for PENDING bit, too. The idea and much of the code come from Ben. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/include/uapi/asm/kvm.h | 2 + arch/powerpc/kvm/book3s_hv_rm_xics.c | 100 +++++++++++++++++++--------- arch/powerpc/kvm/book3s_xics.c | 125 ++++++++++++++++++++++++----------- arch/powerpc/kvm/book3s_xics.h | 5 +- 4 files changed, 161 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..e3db3a50127b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -613,5 +613,7 @@ struct kvm_get_htab_header { #define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) #define KVM_XICS_MASKED (1ULL << 41) #define KVM_XICS_PENDING (1ULL << 42) +#define KVM_XICS_PRESENTED (1ULL << 43) +#define KVM_XICS_QUEUED (1ULL << 44) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 16349c9e86ba..30f82c79de5d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -672,51 +672,39 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; - - if (!xics || !xics->real_mode) - return H_TOO_HARD; + u32 pq_old, pq_new; /* - * ICP State: EOI - * - * Note: If EOI is incorrectly used by SW to lower the CPPR - * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies - * that we don't have to deal with it. + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. * - * The sending of an EOI to the ICS is handled after the - * CPPR update - * - * ICP State: Down_CPPR which we handle - * in a separate function as it's shared with H_CPPR. + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. */ - icp_rm_down_cppr(xics, icp, xirr >> 24); - /* IPIs have no EOI */ - if (irq == XICS_IPI) - goto bail; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. - * - * "Message" interrupts will never have "asserted" set - */ ics = kvmppc_xics_find_ics(xics, irq, &src); if (!ics) goto bail; + state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_rm_deliver_irq(xics, icp, irq); + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_rm_deliver_irq(xics, NULL, irq); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -737,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state->intr_cpu = -1; } } + bail: return check_too_hard(xics, icp); } +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return check_too_hard(xics, icp); + + return ics_rm_eoi(vcpu, irq); +} + unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) @@ -827,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, { struct kvmppc_xics *xics; struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; u32 irq; + u16 src; + u32 pq_old, pq_new; irq = irq_map->v_hwirq; xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; kvmppc_rm_handle_irq_desc(irq_map->desc); - icp_rm_deliver_irq(xics, icp, irq); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return 2; + + state = &ics->irq_state[src]; + + /* only MSIs register bypass producers, so it must be MSI here */ + do { + pq_old = state->pq_state; + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index cdfb4ed73601..c7620622c846 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) struct ics_irq_state *state; struct kvmppc_ics *ics; u16 src; + u32 pq_old, pq_new; XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); @@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET) + level = 1; + else if (level == KVM_INTERRUPT_UNSET) + level = 0; /* - * We set state->asserted locklessly. This should be fine as - * we are the only setter, thus concurrent access is undefined - * to begin with. + * Take other values the same as 1, consistent with original code. + * maybe WARN here? */ - if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) - state->asserted = 1; - else if (level == 0 || level == KVM_INTERRUPT_UNSET) { - state->asserted = 0; + + if (!state->lsi && level == 0) /* noop for MSI */ return 0; - } + + do { + pq_old = state->pq_state; + if (state->lsi) { + if (level) { + if (pq_old & PQ_PRESENTED) + /* Setting already set LSI ... */ + return 0; + + pq_new = PQ_PRESENTED; + } else + pq_new = 0; + } else + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_deliver_irq(xics, NULL, irq); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) state->intr_cpu = raw_smp_processor_id(); - /* Attempt delivery */ - icp_deliver_irq(xics, NULL, irq); - return 0; } @@ -768,14 +785,51 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) icp_deliver_irq(xics, icp, reject); } -static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; + u32 pq_old, pq_new; + + /* + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. + * + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. + */ + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_deliver_irq(xics, icp, irq); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); @@ -798,26 +852,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* IPIs have no EOI */ if (irq == XICS_IPI) return H_SUCCESS; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. - * - * "Message" interrupts will never have "asserted" set - */ - ics = kvmppc_xics_find_ics(xics, irq, &src); - if (!ics) { - XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); - return H_PARAMETER; - } - state = &ics->irq_state[src]; - - /* Still asserted, resend it */ - if (state->asserted) - icp_deliver_irq(xics, icp, irq); - - kvm_notify_acked_irq(vcpu->kvm, 0, irq); - return H_SUCCESS; + return ics_eoi(vcpu, irq); } int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) @@ -975,9 +1011,9 @@ static int xics_debug_show(struct seq_file *m, void *private) for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; - seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n", irq->number, irq->server, irq->priority, - irq->saved_priority, irq->asserted, + irq->saved_priority, irq->pq_state, irq->resend, irq->masked_pending); } @@ -1196,10 +1232,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= prio << KVM_XICS_PRIORITY_SHIFT; if (irqp->lsi) { val |= KVM_XICS_LEVEL_SENSITIVE; - if (irqp->asserted) + if (irqp->pq_state & PQ_PRESENTED) val |= KVM_XICS_PENDING; } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; + + if (irqp->pq_state & PQ_PRESENTED) + val |= KVM_XICS_PRESENTED; + + if (irqp->pq_state & PQ_QUEUED) + val |= KVM_XICS_QUEUED; + ret = 0; } arch_spin_unlock(&ics->lock); @@ -1251,12 +1294,14 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->resend = 0; irqp->masked_pending = 0; irqp->lsi = 0; - irqp->asserted = 0; - if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->pq_state = 0; + if (val & KVM_XICS_LEVEL_SENSITIVE) irqp->lsi = 1; - if (val & KVM_XICS_PENDING) - irqp->asserted = 1; - } + /* If PENDING, set P in case P is not saved because of old code */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + irqp->pq_state |= PQ_PRESENTED; + if (val & KVM_XICS_QUEUED) + irqp->pq_state |= PQ_QUEUED; irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 1d5fac80b706..ec5474cf70c6 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -31,16 +31,19 @@ /* Priority value to use for disabling an interrupt */ #define MASKED 0xff +#define PQ_PRESENTED 1 +#define PQ_QUEUED 2 + /* State for one irq source */ struct ics_irq_state { u32 number; u32 server; + u32 pq_state; u8 priority; u8 saved_priority; u8 resend; u8 masked_pending; u8 lsi; /* level-sensitive interrupt */ - u8 asserted; /* Only for LSI */ u8 exists; int intr_cpu; u32 host_irq; -- cgit v1.2.3-70-g09d2 From 21acd0e4df04f02176e773468658c3cebff096bb Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:36 +0800 Subject: KVM: PPC: Book 3S: XICS: Don't lock twice when checking for resend This patch improves the code that takes lock twice to check the resend flag and do the actual resending, by checking the resend flag locklessly, and add a boolean parameter check_resend to icp_[rm_]deliver_irq(), so the resend flag can be checked in the lock when doing the delivery. We need make sure when we clear the ics's bit in the icp's resend_map, we don't miss the resend flag of the irqs that set the bit. It could be ordered through the barrier in test_and_clear_bit(), and a newly added wmb between setting irq's resend flag, and icp's resend_map. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 40 ++++++++++++------------ arch/powerpc/kvm/book3s_xics.c | 59 +++++++++++++++++------------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 30f82c79de5d..44cfdd281fa1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -35,7 +35,7 @@ int kvm_irq_bypass = 1; EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ @@ -44,22 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, { int i; - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - arch_spin_unlock(&ics->lock); - icp_rm_deliver_irq(xics, icp, state->number); - arch_spin_lock(&ics->lock); + if (state->resend) + icp_rm_deliver_irq(xics, icp, state->number, true); } - arch_spin_unlock(&ics->lock); } /* -- ICP routines -- */ @@ -292,7 +282,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -337,6 +327,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -384,6 +378,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); icp->n_reject++; new_irq = reject; + check_resend = 0; goto again; } } else { @@ -391,9 +386,15 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_rm_try_to_deliver() atomic update and now, then @@ -404,6 +405,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, if (!icp->state.need_resend) { state->resend = 0; arch_spin_unlock(&ics->lock); + check_resend = 0; goto again; } } @@ -598,7 +600,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { this_icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } /* Handle resends in real mode */ @@ -666,7 +668,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) */ if (reject && reject != XICS_IPI) { icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } bail: return check_too_hard(xics, icp); @@ -704,7 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_rm_deliver_irq(xics, NULL, irq); + icp_rm_deliver_irq(xics, NULL, irq, false); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -874,7 +876,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_rm_deliver_irq(xics, icp, irq); + icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index c7620622c846..e48803e2918d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -63,7 +63,7 @@ /* -- ICS routines -- */ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); /* * Return value ideally indicates how the interrupt was handled, but no @@ -117,7 +117,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_deliver_irq(xics, NULL, irq); + icp_deliver_irq(xics, NULL, irq, false); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) @@ -131,31 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - unsigned long flags; - - local_irq_save(flags); - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - XICS_DBG("resend %#x prio %#x\n", state->number, - state->priority); - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); - icp_deliver_irq(xics, icp, state->number); - local_irq_save(flags); - arch_spin_lock(&ics->lock); + if (state->resend) { + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + icp_deliver_irq(xics, icp, state->number, true); + } } - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -209,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) state->masked_pending, state->resend); if (write_xive(xics, ics, state, server, priority, priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -262,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) if (write_xive(xics, ics, state, state->server, state->saved_priority, state->saved_priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -396,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -442,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -490,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; + check_resend = 0; goto again; } } else { @@ -497,9 +485,15 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_try_to_deliver() atomic update and now, then @@ -511,6 +505,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); + check_resend = 0; goto again; } } @@ -702,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); /* Handle resend */ if (resend) @@ -782,7 +777,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_deliver_irq). */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); } static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) @@ -818,7 +813,7 @@ static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); kvm_notify_acked_irq(vcpu->kvm, 0, irq); @@ -1307,7 +1302,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) local_irq_restore(flags); if (val & KVM_XICS_PENDING) - icp_deliver_irq(xics, NULL, irqp->number); + icp_deliver_irq(xics, NULL, irqp->number, false); return 0; } -- cgit v1.2.3-70-g09d2 From 8464c8842de2249061d3c5abc2ccce1bbbd10e7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Dec 2016 20:42:05 +1100 Subject: KVM: PPC: Book3S HV: Fix H_PROD to actually wake the target vcpu The H_PROD hypercall is supposed to wake up an idle vcpu. We have an implementation, but because Linux doesn't use it except when doing cpu hotplug, it was never tested properly. AIX does use it, and reported it broken. It turns out we were waking the wrong vcpu (the one doing H_PROD, not the target of the prod) and we weren't handling the case where the target needs an IPI to wake it. Fix it by using the existing kvmppc_fast_vcpu_kick_hv() function, which is intended for this kind of thing, and by using the target vcpu not the current vcpu. We were also not looking at the prodded flag when checking whether a ceded vcpu should wake up, so this adds checks for the prodded flag alongside the checks for pending exceptions. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d9cc07b1e9c..856cc9d38efd 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -774,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } tvcpu->arch.prodded = 1; smp_mb(); - if (vcpu->arch.ceded) { - if (swait_active(&vcpu->wq)) { - swake_up(&vcpu->wq); - vcpu->stat.halt_wakeup++; - } - } + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); break; case H_CONFER: target = kvmppc_get_gpr(vcpu, 4); @@ -2621,7 +2617,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || + vcpu->arch.prodded) return 1; } @@ -2807,7 +2804,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions) + if (!v->arch.pending_exceptions && !v->arch.prodded) n_ceded += v->arch.ceded; else v->arch.ceded = 0; -- cgit v1.2.3-70-g09d2 From fcd4f3c6d150357a02af8526e69bfebb82dd5d46 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 25 Jan 2017 13:27:22 +0100 Subject: KVM: PPC: Book3S PR: Refactor program interrupt related code into separate function The function kvmppc_handle_exit_pr() is quite huge and thus hard to read, and even contains a "spaghetti-code"-like goto between the different case labels of the big switch statement. This can be made much more readable by moving the code related to injecting program interrupts / instruction emulation into a separate function instead. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 130 +++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1482961ceb4d..d4dfc0ca2a44 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) } } +static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul, r; + + /* + * shadow_srr1 only contains valid flags if we came here via a program + * exception. The other exceptions (emulation assist, FP unavailable, + * etc.) do not provide flags in SRR1, so use an illegal-instruction + * exception when injecting a program interrupt into the guest. + */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) + return RESUME_GUEST; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + return RESUME_GUEST; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + pr_crit("%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + + return r; +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - { - enum emulation_result er; - ulong flags; - u32 last_inst; - int emul; - -program_interrupt: - /* - * shadow_srr1 only contains valid flags if we came here via - * a program exception. The other exceptions (emulation assist, - * FP unavailable, etc.) do not provide flags in SRR1, so use - * an illegal-instruction exception when injecting a program - * interrupt into the guest. - */ - if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - else - flags = SRR1_PROGILL; - - emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); - if (emul != EMULATE_DONE) { - r = RESUME_GUEST; - break; - } - - if (kvmppc_get_msr(vcpu) & MSR_PR) { -#ifdef EXIT_DEBUG - pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", - kvmppc_get_pc(vcpu), last_inst); -#endif - if ((last_inst & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), last_inst); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - case EMULATE_EXIT_USER: - r = RESUME_HOST_NV; - break; - default: - BUG(); - } + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); break; - } case BOOK3S_INTERRUPT_SYSCALL: { u32 last_sc; @@ -1185,7 +1185,7 @@ program_interrupt: emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul == EMULATE_DONE) - goto program_interrupt; + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); else r = RESUME_GUEST; -- cgit v1.2.3-70-g09d2 From ab22a4733fe919d22bc2957680506ed17e40941e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:28 -0800 Subject: kvm: x86: mmu: Rename EPT_VIOLATION_READ/WRITE/INSTR constants Rename the EPT_VIOLATION_READ/WRITE/INSTR constants to EPT_VIOLATION_ACC_READ/WRITE/INSTR to more clearly indicate that these signify the type of the memory access as opposed to the permissions granted by the PTE. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 12 ++++++------ arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a22a4790f1ac..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -510,15 +510,15 @@ struct vmx_msr_entry { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_READ_BIT 0 -#define EPT_VIOLATION_WRITE_BIT 1 -#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 -#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) -#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) -#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7bafa1457e2..81c301def1af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6375,13 +6375,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_READ) + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) ? PFERR_USER_MASK : 0; /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ error_code |= (exit_qualification & -- cgit v1.2.3-70-g09d2 From 312b616b30d87581b88d3db54c14ed89610cc97b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:29 -0800 Subject: kvm: x86: mmu: Set SPTE_SPECIAL_MASK within mmu.c Instead of the caller including the SPTE_SPECIAL_MASK in the masks being supplied to kvm_mmu_set_mmio_spte_mask() and kvm_mmu_set_mask_ptes(), those functions now themselves include the SPTE_SPECIAL_MASK. Note that bit 63 is now reset in the default MMIO mask. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- arch/x86/kvm/vmx.c | 6 ++---- arch/x86/kvm/x86.c | 3 --- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 64821ca3a7c3..e3312e22e8db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -208,7 +208,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); @@ -318,6 +318,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81c301def1af..d850d5d36182 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | - VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6585,7 +6583,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e2c71ea0627..4fd4d4f35caf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5952,9 +5952,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; -- cgit v1.2.3-70-g09d2 From 20d65236d01cdbe14a88f0e2c0f985669f8c41fc Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:31 -0800 Subject: kvm: x86: mmu: Update comment in mark_spte_for_access_track Reword the comment to hopefully make it more clear. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e3312e22e8db..e13041ac7cdf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -708,9 +708,9 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; /* - * Verify that the write-protection that we do below will be fixable - * via the fast page fault path. Currently, that is always the case, at - * least when using EPT (which is when access tracking would be used). + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. */ WARN_ONCE((spte & PT_WRITABLE_MASK) && !spte_can_locklessly_be_made_writable(spte), -- cgit v1.2.3-70-g09d2 From d162f30a7cebe9731fd331419b3a14089d0b41e3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:30 -0800 Subject: kvm: x86: mmu: Move pgtbl walk inside retry loop in fast_page_fault Redo the page table walk in fast_page_fault when retrying so that we are working on the latest PTE even if the hierarchy changes. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e13041ac7cdf..437d16274701 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3088,14 +3088,16 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) - break; do { bool remove_write_prot = false; bool remove_acc_track; + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; @@ -3176,8 +3178,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; } - spte = mmu_spte_get_lockless(iterator.sptep); - } while (true); trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, -- cgit v1.2.3-70-g09d2 From d3e328f2cb01f6f09259a5810baae3edf5416076 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:32 -0800 Subject: kvm: x86: mmu: Verify that restored PTE has needed perms in fast page fault Before fast page fault restores an access track PTE back to a regular PTE, it now also verifies that the restored PTE would grant the necessary permissions for the faulting access to succeed. If not, it falls back to the slow page fault path. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 127 ++++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 437d16274701..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -373,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -728,6 +733,23 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; } +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { @@ -3019,27 +3041,12 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 old_spte, - bool remove_write_prot, bool remove_acc_track) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; - u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - if (remove_acc_track) { - u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) - & shadow_acc_track_saved_bits_mask; - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift); - new_spte |= saved_bits; - } - - if (remove_write_prot) - new_spte |= PT_WRITABLE_MASK; - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3055,7 +3062,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (remove_write_prot) { + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { /* * The gfn of direct spte is stable since it is * calculated by sp->gfn. @@ -3067,6 +3074,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -3090,8 +3109,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, walk_shadow_page_lockless_begin(vcpu); do { - bool remove_write_prot = false; - bool remove_acc_track; + u64 new_spte; for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) if (!is_shadow_present_pte(spte) || @@ -3112,52 +3130,44 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Need not check the access of upper level table entries since * they are always ACC_ALL. */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - if (error_code & PFERR_FETCH_MASK) { - if ((spte & (shadow_x_mask | shadow_nx_mask)) - == shadow_x_mask) { - fault_handled = true; - break; - } - } else if (error_code & PFERR_WRITE_MASK) { - if (is_writable_pte(spte)) { - fault_handled = true; - break; - } + new_spte = spte; + + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; /* - * Currently, to simplify the code, write-protection can - * be removed in the fast path only if the SPTE was - * write-protected for dirty-logging. + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). */ - remove_write_prot = - spte_can_locklessly_be_made_writable(spte); - } else { - /* Fault was on Read access */ - if (spte & PT_PRESENT_MASK) { - fault_handled = true; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) break; - } } - remove_acc_track = is_access_track_spte(spte); - /* Verify that the fault can be handled in the fast path */ - if (!remove_acc_track && !remove_write_prot) - break; - - /* - * Do not fix write-permission on the large spte since we only - * dirty the first page into the dirty-bitmap in - * fast_pf_fix_direct_spte() that means other pages are missed - * if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte - * to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) break; /* @@ -3167,8 +3177,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - remove_write_prot, - remove_acc_track); + new_spte); if (fault_handled) break; -- cgit v1.2.3-70-g09d2 From a679c547d19ded9b7d1a88e17cf5e5b69ac619b6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 15 Dec 2016 15:58:14 +0100 Subject: KVM: s390: gaccess: add ESOP2 handling When we access guest memory and run into a protection exception, we need to pass the exception data to the guest. ESOP2 provides detailed information about all protection exceptions which ESOP1 only partially provided. The gaccess changes make sure, that the guest always gets all available information. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 12 +++++++++--- arch/s390/tools/gen_facilities.c | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4aa8a7e2a1da..6e94705efd4e 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, struct trans_exc_code_bits { unsigned long addr : 52; /* Translation-exception Address */ unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 6; + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; unsigned long b60 : 1; unsigned long b61 : 1; unsigned long as : 2; /* ASCE Identifier */ @@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_LA: + tec->b56 = 1; + break; + case PROT_TYPE_KEYC: + tec->b60 = 1; + break; case PROT_TYPE_ALC: tec->b60 = 1; /* FALL THROUGH */ case PROT_TYPE_DAT: tec->b61 = 1; break; - default: /* LA and KEYC set b61 to 0, other params undefined */ - return code; } /* FALL THROUGH */ case PGM_ASCE_TYPE: diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 8cc53b1e6d03..b6ac8dfee001 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ static struct facility_def facility_defs[] = { 76, /* msa extension 3 */ 77, /* msa extension 4 */ 78, /* enhanced-DAT 2 */ + 131, /* enhanced-SOP 2 and side-effect */ -1 /* END */ } }, -- cgit v1.2.3-70-g09d2 From cd1836f583d78bdd15ef748f4d85bf007569c7ad Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 4 Aug 2016 09:57:36 +0200 Subject: KVM: s390: instruction-execution-protection support The new Instruction Execution Protection needs to be enabled before the guest can use it. Therefore we pass the IEP facility bit to the guest and enable IEP interpretation. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 3 +++ arch/s390/tools/gen_facilities.c | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bec71e902be3..e5130818fb03 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1938,6 +1938,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) vcpu->arch.sie_block->ecb2 |= 0x08; + if (test_kvm_facility(vcpu->kvm, 130)) + vcpu->arch.sie_block->ecb2 |= 0x20; vcpu->arch.sie_block->eca = 0x1002000U; if (sclp.has_cei) vcpu->arch.sie_block->eca |= 0x80000000U; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d8673e243f13..ed62c6d57d93 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + /* Instruction Execution Prevention */ + if (test_kvm_facility(vcpu->kvm, 130)) + scb_s->ecb2 |= scb_o->ecb2 & 0x20U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) scb_s->eca |= scb_o->eca & 0x00000001U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index b6ac8dfee001..0cf802de52a1 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ static struct facility_def facility_defs[] = { 76, /* msa extension 3 */ 77, /* msa extension 4 */ 78, /* enhanced-DAT 2 */ + 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ -1 /* END */ } -- cgit v1.2.3-70-g09d2 From d051ae531324fb5130366d47e05bf8eadeb95535 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 13 Dec 2016 14:25:32 +0100 Subject: KVM: s390: get rid of bogus cc initialization The plo inline assembly has a cc output operand that is always written to and is also as such an operand declared. Therefore the compiler is free to omit the rather pointless and misleading initialization. Get rid of this. Signed-off-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e5130818fb03..4f74511015b8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -217,7 +217,7 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; - int cc = 3; /* subfunction not available */ + int cc; asm volatile( /* Parameter registers are ignored for "test bit" */ -- cgit v1.2.3-70-g09d2 From 27f67f8727843fbbbcd05a003183af79693759e9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 9 Dec 2016 12:44:40 +0100 Subject: KVM: s390: Get rid of ar_t sparse with __CHECK_ENDIAN__ shows that ar_t was never properly used across KVM on s390. We can now: - fix all places - do not make ar_t special Since ar_t is just used as a register number (no endianness issues for u8), and all other register numbers are also just plain int variables, let's just use u8, which matches the __u8 in the userspace ABI for the memop ioctl. Signed-off-by: Christian Borntraeger Acked-by: Janosch Frank Reviewed-by: Cornelia Huck --- arch/s390/kvm/gaccess.c | 14 +++++++------- arch/s390/kvm/gaccess.h | 10 +++++----- arch/s390/kvm/kvm-s390.h | 10 ++++------ arch/s390/kvm/priv.c | 30 +++++++++++++++--------------- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6e94705efd4e..4492c9363178 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu) ipte_unlock_simple(vcpu); } -static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, +static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, enum gacc_mode mode) { union alet alet; @@ -487,7 +487,7 @@ enum prot_type { }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - ar_t ar, enum gacc_mode mode, enum prot_type prot) + u8 ar, enum gacc_mode mode, enum prot_type prot) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; struct trans_exc_code_bits *tec; @@ -545,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, } static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - unsigned long ga, ar_t ar, enum gacc_mode mode) + unsigned long ga, u8 ar, enum gacc_mode mode) { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); @@ -777,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { @@ -809,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -883,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this. */ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -916,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, /** * check_gva_range - test a range of guest virtual addresses for accessibility */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode) { unsigned long gpa; diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 8756569ad938..5c9cc18f3b4a 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -162,11 +162,11 @@ enum gacc_mode { }; int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - ar_t ar, unsigned long *gpa, enum gacc_mode mode); -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, + u8 ar, unsigned long *gpa, enum gacc_mode mode); +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, @@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. */ static inline __must_check -int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_STORE); @@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, * data will be copied from guest space to kernel space. */ static inline __must_check -int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3a4e97f1a9e6..22a0a7ceffad 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } -typedef u8 __bitwise ar_t; - -static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); @@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2, - ar_t *ar_b1, ar_t *ar_b2) + u8 *ar_b1, u8 *ar_b2) { u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; @@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; } -static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + @@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; } -static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e18435355c16..1ecc1cffdf7c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) static int handle_set_clock(struct kvm_vcpu *vcpu) { int rc; - ar_t ar; + u8 ar; u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) @@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_spx++; @@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stpx++; @@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) u16 vcpu_id = vcpu->vcpu_id; u64 ga; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stap++; @@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu) u32 tpi_data[3]; int rc; u64 addr; - ar_t ar; + u8 ar; addr = kvm_s390_get_base_disp_s(vcpu, &ar); if (addr & 3) @@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) psw_compat_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) psw_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) u64 stidp_data = vcpu->kvm->arch.model.cpuid; u64 operand2; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stidp++; @@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) ASCEBC(mem->vm[0].cpi, 16); } -static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar, +static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar, u8 fc, u8 sel1, u16 sel2) { vcpu->run->exit_reason = KVM_EXIT_S390_STSI; @@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) unsigned long mem = 0; u64 operand2; int rc = 0; - ar_t ar; + u8 ar; vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); @@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_lctl++; @@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctl++; @@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_lctlg++; @@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctg++; @@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) unsigned long hva, gpa; int ret = 0, cc = 0; bool writable; - ar_t ar; + u8 ar; vcpu->stat.instruction_tprot++; -- cgit v1.2.3-70-g09d2 From 53743aa7f14671dea6f3567ddca2f7d97454f3fe Mon Sep 17 00:00:00 2001 From: Maxim Samoylov Date: Wed, 10 Feb 2016 10:31:23 +0100 Subject: KVM: s390: Introduce Vector Enhancements facility 1 to the guest We can directly forward the vector enhancement facility 1 to the guest if available and VX is requested by user space. Please note that user space will have to take care of the final state of the facility bit when migrating to older machines. Reviewed-by: David Hildenbrand Signed-off-by: Maxim Samoylov Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4f74511015b8..1fd4b854efdc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -505,6 +505,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(135)) { + set_kvm_facility(kvm->arch.model.fac_mask, 135); + set_kvm_facility(kvm->arch.model.fac_list, 135); + } r = 0; } else r = -EINVAL; -- cgit v1.2.3-70-g09d2 From 2f87d942be9d0f86e44fbcbd473264c26c7f1809 Mon Sep 17 00:00:00 2001 From: Guenther Hutzl Date: Fri, 3 Jun 2016 14:37:17 +0200 Subject: KVM: s390: Introduce BCD Vector Instructions to the guest We can directly forward the vector BCD instructions to the guest if available and VX is requested by user space. Please note that user space will have to take care of the final state of the facility bit when migrating to older machines. Signed-off-by: Guenther Hutzl Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1fd4b854efdc..69401b8d4521 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -505,6 +505,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(134)) { + set_kvm_facility(kvm->arch.model.fac_mask, 134); + set_kvm_facility(kvm->arch.model.fac_list, 134); + } if (test_facility(135)) { set_kvm_facility(kvm->arch.model.fac_mask, 135); set_kvm_facility(kvm->arch.model.fac_list, 135); -- cgit v1.2.3-70-g09d2 From f41711788c9c281a61c8cf3222dca8a0e74a4fb3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:33:52 +0200 Subject: KVM: s390: guestdbg: filter i-fetch events on icpts We already filter PER events reported via icpt code 8. For icpt code 4 and 56, this is still missing. So let's properly detect if we have a debugging event and if we have to inject a PER i-fetch event into the guest at all. Signed-off-by: David Hildenbrand Reviewed-by: Christian Borntraeger Cc: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/guestdbg.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index d7c6a7f53ced..a2077833ab01 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) #define per_write_wp_event(code) \ (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) -static int debug_exit_required(struct kvm_vcpu *vcpu) +static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc, + unsigned long peraddr) { - u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; unsigned long addr = vcpu->arch.sie_block->gpsw.addr; - unsigned long peraddr = vcpu->arch.sie_block->peraddr; if (guestdbg_hw_bp_enabled(vcpu)) { if (per_write_wp_event(perc) && @@ -442,6 +441,8 @@ exit_required: int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) { + const u64 cr10 = vcpu->arch.sie_block->gcr[10]; + const u64 cr11 = vcpu->arch.sie_block->gcr[11]; const u8 ilen = kvm_s390_get_ilen(vcpu); struct kvm_s390_pgm_info pgm_info = { .code = PGM_PER, @@ -454,7 +455,19 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) * instruction generated a PER i-fetch event. PER address therefore * points at the previous PSW address (could be an EXECUTE function). */ - return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + if (!guestdbg_enabled(vcpu)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + + if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address)) + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + + if (!guest_per_enabled(vcpu) || + !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) + return 0; + + if (in_addr_range(pgm_info.per_address, cr10, cr11)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + return 0; } static void filter_guest_per_event(struct kvm_vcpu *vcpu) @@ -500,7 +513,8 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { int new_as; - if (debug_exit_required(vcpu)) + if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, + vcpu->arch.sie_block->peraddr)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; filter_guest_per_event(vcpu); -- cgit v1.2.3-70-g09d2 From 3fa8cad7402cfe982a60d746609e89aafe15d131 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:00:49 +0200 Subject: KVM: s390: prepare to read random guest instructions We will have to read instructions not residing at the current PSW address. Reviewed-by: Eric Farman Signed-off-by: David Hildenbrand Cc: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.h | 9 +++++---- arch/s390/kvm/kvm-s390.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 5c9cc18f3b4a..7ce47fd36f28 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, /** * read_guest_instr - copy instruction data from guest space to kernel space * @vcpu: virtual cpu + * @ga: guest address * @data: destination address in kernel space * @len: number of bytes to copy * - * Copy @len bytes from the current psw address (guest space) to @data (kernel + * Copy @len bytes from the given address (guest space) to @data (kernel * space). * * The behaviour of read_guest_instr is identical to read_guest, except that @@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * address-space mode. */ static inline __must_check -int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len) +int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, + unsigned long len) { - return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len, - GACC_IFETCH); + return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); } /** diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 69401b8d4521..66e73f4ed64b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2588,7 +2588,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) * to look up the current opcode to get the length of the instruction * to be able to forward the PSW. */ - rc = read_guest_instr(vcpu, &opcode, 1); + rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1); ilen = insn_length(opcode); if (rc < 0) { return rc; -- cgit v1.2.3-70-g09d2 From a69cbe81b2f38437113c05019a134a4731a3aa78 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:40:11 +0200 Subject: KVM: s390: guestdbg: filter PER i-fetch on EXECUTE properly When we get a PER i-fetch event on an EXECUTE or EXECUTE RELATIVE LONG instruction, because the executed instruction generated a PER i-fetch event, then the PER address points at the EXECUTE function, not the fetched one. Therefore, when filtering PER events, we have to take care of the really fetched instruction, which we can only get by reading in guest virtual memory. For icpt code 4 and 56, we directly have additional information about an EXECUTE instruction at hand. For icpt code 8, we always have to read in guest virtual memory. Signed-off-by: David Hildenbrand Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger [small fixes] --- arch/s390/kvm/guestdbg.c | 98 ++++++++++++++++++++++++++++++++++++++++++----- arch/s390/kvm/intercept.c | 4 +- arch/s390/kvm/kvm-s390.h | 2 +- 3 files changed, 93 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index a2077833ab01..23d9a4e12da1 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -436,6 +436,64 @@ exit_required: return 1; } +static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr) +{ + u8 exec_ilen = 0; + u16 opcode[3]; + int rc; + + if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) { + /* PER address references the fetched or the execute instr */ + *addr = vcpu->arch.sie_block->peraddr; + /* + * Manually detect if we have an EXECUTE instruction. As + * instructions are always 2 byte aligned we can read the + * first two bytes unconditionally + */ + rc = read_guest_instr(vcpu, *addr, &opcode, 2); + if (rc) + return rc; + if (opcode[0] >> 8 == 0x44) + exec_ilen = 4; + if ((opcode[0] & 0xff0f) == 0xc600) + exec_ilen = 6; + } else { + /* instr was suppressed, calculate the responsible instr */ + *addr = __rewind_psw(vcpu->arch.sie_block->gpsw, + kvm_s390_get_ilen(vcpu)); + if (vcpu->arch.sie_block->icptstatus & 0x01) { + exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4; + if (!exec_ilen) + exec_ilen = 4; + } + } + + if (exec_ilen) { + /* read the complete EXECUTE instr to detect the fetched addr */ + rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen); + if (rc) + return rc; + if (exec_ilen == 6) { + /* EXECUTE RELATIVE LONG - RIL-b format */ + s32 rl = *((s32 *) (opcode + 1)); + + /* rl is a _signed_ 32 bit value specifying halfwords */ + *addr += (u64)(s64) rl * 2; + } else { + /* EXECUTE - RX-a format */ + u32 base = (opcode[1] & 0xf000) >> 12; + u32 disp = opcode[1] & 0x0fff; + u32 index = opcode[0] & 0x000f; + + *addr = base ? vcpu->run->s.regs.gprs[base] : 0; + *addr += index ? vcpu->run->s.regs.gprs[index] : 0; + *addr += disp; + } + *addr = kvm_s390_logical_to_effective(vcpu, *addr); + } + return 0; +} + #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) @@ -449,6 +507,8 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) .per_code = PER_CODE_IFETCH, .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), }; + unsigned long fetched_addr; + int rc; /* * The PSW points to the next instruction, therefore the intercepted @@ -465,21 +525,29 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) return 0; - if (in_addr_range(pgm_info.per_address, cr10, cr11)) + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + if (rc) + /* instruction-fetching exceptions */ + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (in_addr_range(fetched_addr, cr10, cr11)) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); return 0; } -static void filter_guest_per_event(struct kvm_vcpu *vcpu) +static int filter_guest_per_event(struct kvm_vcpu *vcpu) { const u8 perc = vcpu->arch.sie_block->perc; - u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; + unsigned long fetched_addr; + int rc; if (!guest_per_enabled(vcpu)) guest_perc = 0; @@ -491,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_CODE_IFETCH && - !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_CODE_IFETCH; + if (guest_perc & PER_CODE_IFETCH) { + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + /* + * Don't inject an irq on exceptions. This would make handling + * on icpt code 8 very complex (as PSW was already rewound). + */ + if (rc || !in_addr_range(fetched_addr, cr10, cr11)) + guest_perc &= ~PER_CODE_IFETCH; + } /* All other PER events will be given to the guest */ /* TODO: Check altered address/address space */ @@ -502,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; + return 0; } #define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) @@ -509,15 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) #define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) #define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { - int new_as; + int rc, new_as; if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, vcpu->arch.sie_block->peraddr)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; - filter_guest_per_event(vcpu); + rc = filter_guest_per_event(vcpu); + if (rc) + return rc; /* * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger @@ -546,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) (pssec(vcpu) || old_ssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; } + return 0; } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7a27eebab28a..8b13f7098c61 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; if (guestdbg_enabled(vcpu) && per_event(vcpu)) { - kvm_s390_handle_per_event(vcpu); + rc = kvm_s390_handle_per_event(vcpu); + if (rc) + return rc; /* the interrupt might have been filtered out completely */ if (vcpu->arch.sie_block->iprcc == 0) return 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 22a0a7ceffad..af9fa91a0c91 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -377,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) -- cgit v1.2.3-70-g09d2 From 4bead2a423ea5268b0ab3cba058e215c65ee2cbd Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 27 Jan 2017 10:23:59 +0100 Subject: KVM: s390: Fix RRBE return code not being CC reset_guest_reference_bit needs to return the CC, so we can set it in the guest PSW when emulating RRBE. Right now it only returns 0. Let's fix that. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7a1897c51c54..f70db837ddc4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -741,7 +741,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - return 0; + return cc; } EXPORT_SYMBOL(reset_guest_reference_bit); -- cgit v1.2.3-70-g09d2 From a8c39dd77cb9fad0d0e5c5e6581851bdcbc1e6f6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 18 Jan 2017 16:01:02 +0100 Subject: KVM: s390: Add debug logging to basic cpu model interface Let's log something for changes in facilities, cpuid and ibc now that we have a cpu model in QEMU. All of these calls are pretty seldom, so we will not spill the log, the they will help to understand pontential guest issues, for example if some instructions are fenced off. As the s390 debug feature has a limited amount of parameters and strings must not go away we limit the facility printing to 3 double words, instead of building that list dynamically. This should be enough for several years. If we ever exceed 3 double words then the logging will be incomplete but no functional impact will happen. Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4078ba630689..dabd3b15bf11 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -829,6 +829,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) } memcpy(kvm->arch.model.fac_list, proc->fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); } else ret = -EFAULT; kfree(proc); @@ -902,6 +909,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) proc->ibc = kvm->arch.model.ibc; memcpy(&proc->fac_list, kvm->arch.model.fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) ret = -EFAULT; kfree(proc); @@ -925,6 +939,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) S390_ARCH_FAC_LIST_SIZE_BYTE); memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, sizeof(S390_lowcore.stfle_fac_list)); + VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_mask[0], + mach->fac_mask[1], + mach->fac_mask[2]); + VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_list[0], + mach->fac_list[1], + mach->fac_list[2]); if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) ret = -EFAULT; kfree(mach); -- cgit v1.2.3-70-g09d2 From 2df903a89a81c44da7ae94837af5c00a72ce6aaf Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:46 +0530 Subject: KVM: arm/arm64: vgic: Implement support for userspace access Read and write of some registers like ISPENDR and ICPENDR from userspace requires special handling when compared to guest access for these registers. Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt for handling of ISPENDR, ICPENDR registers handling. Add infrastructure to support guest and userspace read and write for the required registers Also moved vgic_uaccess from vgic-mmio-v2.c to vgic-mmio.c Signed-off-by: Vijaya Kumar K Reviewed-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio-v2.c | 25 ----------- virt/kvm/arm/vgic/vgic-mmio-v3.c | 96 ++++++++++++++++++++++++++++++++-------- virt/kvm/arm/vgic/vgic-mmio.c | 78 +++++++++++++++++++++++++++++--- virt/kvm/arm/vgic/vgic-mmio.h | 19 ++++++++ 4 files changed, 169 insertions(+), 49 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 07e67f1e25ef..270eb4ab528e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -407,31 +407,6 @@ int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) return -ENXIO; } -/* - * When userland tries to access the VGIC register handlers, we need to - * create a usable struct vgic_io_device to be passed to the handlers and we - * have to set up a buffer similar to what would have happened if a guest MMIO - * access occurred, including doing endian conversions on BE systems. - */ -static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, - bool is_write, int offset, u32 *val) -{ - unsigned int len = 4; - u8 buf[4]; - int ret; - - if (is_write) { - vgic_data_host_to_mmio_bus(buf, len, *val); - ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf); - } else { - ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf); - if (!ret) - *val = vgic_data_mmio_bus_to_host(buf, len); - } - - return ret; -} - int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val) { diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 2aca52a6c9eb..3548bb207a1b 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -207,6 +207,60 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, return 0; } +static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* + * pending state of interrupt is latched in pending_latch variable. + * Userspace will save and restore pending state and line_level + * separately. + * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt + * for handling of ISPENDR and ICPENDR. + */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->pending_latch) + value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + if (test_bit(i, &val)) { + /* + * pending_latch is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before pending info. + */ + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq); + } else { + irq->pending_latch = false; + spin_unlock(&irq->irq_lock); + } + + vgic_put_irq(vcpu->kvm, irq); + } +} + /* We want to avoid outer shareable. */ u64 vgic_sanitise_shareability(u64 field) { @@ -356,7 +410,7 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, * We take some special care here to fix the calculation of the register * offset. */ -#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc) \ +#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ { \ .reg_offset = off, \ .bits_per_irq = bpi, \ @@ -371,6 +425,8 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, .access_flags = acc, \ .read = rd, \ .write = wr, \ + .uaccess_read = ur, \ + .uaccess_write = uw, \ } static const struct vgic_register_region vgic_v3_dist_registers[] = { @@ -378,40 +434,42 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, - vgic_mmio_read_rao, vgic_mmio_write_wi, 1, + vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, - vgic_mmio_read_enable, vgic_mmio_write_senable, 1, + vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, + vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, - vgic_mmio_read_pending, vgic_mmio_write_spending, 1, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_write_wi, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, - vgic_mmio_read_active, vgic_mmio_write_sactive, 1, + vgic_mmio_read_active, vgic_mmio_write_sactive, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, - vgic_mmio_read_active, vgic_mmio_write_cactive, 1, + vgic_mmio_read_active, vgic_mmio_write_cactive, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, - vgic_mmio_read_priority, vgic_mmio_write_priority, 8, - VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, + 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, - vgic_mmio_read_config, vgic_mmio_write_config, 2, + vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 1, + vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, - vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64, + vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, @@ -449,11 +507,13 @@ static const struct vgic_register_region vgic_v3_sgibase_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_spending, 4, + REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_spending, + vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0, - vgic_mmio_read_pending, vgic_mmio_write_cpending, 4, + REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0, vgic_mmio_read_active, vgic_mmio_write_sactive, 4, diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 2670d39d1f86..3fab26422946 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -475,6 +475,74 @@ static bool check_region(const struct kvm *kvm, return false; } +static const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len) +{ + const struct vgic_register_region *region; + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region || !check_region(vcpu->kvm, region, addr, len)) + return NULL; + + return region; +} + +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) { + *val = 0; + return 0; + } + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_read) + *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); + else + *val = region->read(r_vcpu, addr, sizeof(u32)); + + return 0; +} + +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, const u32 *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + + region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); + if (!region) + return 0; + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + if (region->uaccess_write) + region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); + else + region->write(r_vcpu, addr, sizeof(u32), *val); + + return 0; +} + +/* + * Userland access to VGIC registers. + */ +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val) +{ + if (is_write) + return vgic_uaccess_write(vcpu, &dev->dev, offset, val); + else + return vgic_uaccess_read(vcpu, &dev->dev, offset, val); +} + static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { @@ -482,9 +550,8 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, const struct vgic_register_region *region; unsigned long data = 0; - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) { + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) { memset(val, 0, len); return 0; } @@ -515,9 +582,8 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, const struct vgic_register_region *region; unsigned long data = vgic_data_mmio_bus_to_host(val, len); - region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, - addr - iodev->base_addr); - if (!region || !check_region(vcpu->kvm, region, addr, len)) + region = vgic_get_mmio_region(vcpu, iodev, addr, len); + if (!region) return 0; switch (iodev->iodev_type) { diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 84961b4e4422..7b30296a7306 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -34,6 +34,10 @@ struct vgic_register_region { gpa_t addr, unsigned int len, unsigned long val); }; + unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + void (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); }; extern struct kvm_io_device_ops kvm_io_gic_ops; @@ -86,6 +90,18 @@ extern struct kvm_io_device_ops kvm_io_gic_ops; .write = wr, \ } +#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + .uaccess_read = urd, \ + .uaccess_write = uwr, \ + } + int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, struct vgic_register_region *reg_desc, struct vgic_io_device *region, @@ -158,6 +174,9 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); +int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val); + unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); -- cgit v1.2.3-70-g09d2 From 94574c9488e253c65ad2e8955163e330f07119b6 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:47 +0530 Subject: KVM: arm/arm64: vgic: Add distributor and redistributor access VGICv3 Distributor and Redistributor registers are accessed using KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_REDIST_REGS with KVM_SET_DEVICE_ATTR and KVM_GET_DEVICE_ATTR ioctls. These registers are accessed as 32-bit and cpu mpidr value passed along with register offset is used to identify the cpu for redistributor registers access. The version of VGIC v3 specification is defined here Documentation/virtual/kvm/devices/arm-vgic-v3.txt Also update arch/arm/include/uapi/asm/kvm.h to compile for AArch32 mode. Signed-off-by: Vijaya Kumar K Reviewed-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- arch/arm/include/uapi/asm/kvm.h | 4 + arch/arm64/include/uapi/asm/kvm.h | 4 + virt/kvm/arm/vgic/vgic-kvm-device.c | 161 ++++++++++++++++++++++++++++++++---- virt/kvm/arm/vgic/vgic-mmio-v2.c | 40 ++++----- virt/kvm/arm/vgic/vgic-mmio-v3.c | 85 +++++++++++++++++++ virt/kvm/arm/vgic/vgic-mmio.c | 2 +- virt/kvm/arm/vgic/vgic.h | 40 ++++++++- 7 files changed, 300 insertions(+), 36 deletions(-) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index af05f8e0903e..0ae6035e0604 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -181,10 +181,14 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 #define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 #define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) +#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32 +#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \ + (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 +#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* KVM_IRQ_LINE irq field index values */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 3051f86a9b5f..56dc08db6492 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -201,10 +201,14 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 #define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 #define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) +#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32 +#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \ + (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 +#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* Device Control API on vcpu fd */ diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index fbe87a63d250..227337fd5156 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "vgic.h" /* common helpers */ @@ -230,14 +231,8 @@ int kvm_register_vgic_device(unsigned long type) return ret; } -struct vgic_reg_attr { - struct kvm_vcpu *vcpu; - gpa_t addr; -}; - -static int parse_vgic_v2_attr(struct kvm_device *dev, - struct kvm_device_attr *attr, - struct vgic_reg_attr *reg_attr) +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) { int cpuid; @@ -292,14 +287,14 @@ static bool lock_all_vcpus(struct kvm *kvm) } /** - * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state + * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state * * @dev: kvm device handle * @attr: kvm device attribute * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ -static int vgic_attr_regs_access_v2(struct kvm_device *dev, +static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, u32 *reg, bool is_write) { @@ -308,7 +303,7 @@ static int vgic_attr_regs_access_v2(struct kvm_device *dev, struct kvm_vcpu *vcpu; int ret; - ret = parse_vgic_v2_attr(dev, attr, ®_attr); + ret = vgic_v2_parse_attr(dev, attr, ®_attr); if (ret) return ret; @@ -362,7 +357,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev, if (get_user(reg, uaddr)) return -EFAULT; - return vgic_attr_regs_access_v2(dev, attr, ®, true); + return vgic_v2_attr_regs_access(dev, attr, ®, true); } } @@ -384,7 +379,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev, u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 reg = 0; - ret = vgic_attr_regs_access_v2(dev, attr, ®, false); + ret = vgic_v2_attr_regs_access(dev, attr, ®, false); if (ret) return ret; return put_user(reg, uaddr); @@ -428,16 +423,149 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = { .has_attr = vgic_v2_has_attr, }; +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + unsigned long vgic_mpidr, mpidr_reg; + + /* + * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, + * attr might not hold MPIDR. Hence assume vcpu0. + */ + if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { + vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> + KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; + + mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); + reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); + } else { + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); + } + + if (!reg_attr->vcpu) + return -EINVAL; + + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* + * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register + */ +static int vgic_v3_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) +{ + struct vgic_reg_attr reg_attr; + gpa_t addr; + struct kvm_vcpu *vcpu; + int ret; + u32 tmp32; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + mutex_lock(&dev->kvm->lock); + + if (unlikely(!vgic_initialized(dev->kvm))) { + ret = -EBUSY; + goto out; + } + + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; + goto out; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + if (is_write) + tmp32 = *reg; + + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); + if (!is_write) + *reg = tmp32; + break; + default: + ret = -EINVAL; + break; + } + + unlock_all_vcpus(dev->kvm); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - return vgic_set_common_attr(dev, attr); + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 tmp32; + u64 reg; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } + } + return -ENXIO; } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - return vgic_get_common_attr(dev, attr); + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + } + + return -ENXIO; } static int vgic_v3_has_attr(struct kvm_device *dev, @@ -451,6 +579,9 @@ static int vgic_v3_has_attr(struct kvm_device *dev, return 0; } break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: return 0; case KVM_DEV_ARM_VGIC_GRP_CTRL: diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 270eb4ab528e..fa68dd41756e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -369,21 +369,30 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { - int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; - const struct vgic_register_region *regions; + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; gpa_t addr; - int nr_regions, i, len; + int ret; + + ret = vgic_v2_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; - addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - regions = vgic_v2_dist_registers; - nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + iodev.regions = vgic_v2_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + iodev.base_addr = 0; break; case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - regions = vgic_v2_cpu_registers; - nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + iodev.regions = vgic_v2_cpu_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + iodev.base_addr = 0; break; default: return -ENXIO; @@ -393,18 +402,11 @@ int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) if (addr & 3) return -ENXIO; - for (i = 0; i < nr_regions; i++) { - if (regions[i].bits_per_irq) - len = (regions[i].bits_per_irq * nr_irqs) / 8; - else - len = regions[i].len; - - if (regions[i].reg_offset <= addr && - regions[i].reg_offset + len > addr) - return 0; - } + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; - return -ENXIO; + return 0; } int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 3548bb207a1b..2031138cfcf4 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -18,6 +18,8 @@ #include #include +#include +#include #include "vgic.h" #include "vgic-mmio.h" @@ -433,6 +435,9 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { REGISTER_DESC_WITH_LENGTH(GICD_CTLR, vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, + vgic_mmio_read_rao, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1, VGIC_ACCESS_32bit), @@ -480,12 +485,18 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_CTLR, vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IIDR, vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_TYPER, vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_WAKER, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), @@ -606,6 +617,48 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) return ret; } +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + const struct vgic_register_region *region; + struct vgic_io_device iodev; + struct vgic_reg_attr reg_attr; + struct kvm_vcpu *vcpu; + gpa_t addr; + int ret; + + ret = vgic_v3_parse_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + iodev.regions = vgic_v3_dist_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + iodev.base_addr = 0; + break; + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ + iodev.regions = vgic_v3_rdbase_registers; + iodev.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); + iodev.base_addr = 0; + break; + } + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); + if (!region) + return -ENXIO; + + return 0; +} /* * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI * generation register ICC_SGI1R_EL1) with a given VCPU. @@ -712,3 +765,35 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) vgic_put_irq(vcpu->kvm, irq); } } + +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v3_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device rd_dev = { + .regions = vgic_v3_rdbase_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers), + }; + + struct vgic_io_device sgi_dev = { + .regions = vgic_v3_sgibase_registers, + .nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers), + }; + + /* SGI_base is the next 64K frame after RD_base */ + if (offset >= SZ_64K) + return vgic_uaccess(vcpu, &sgi_dev, is_write, offset - SZ_64K, + val); + else + return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); +} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 3fab26422946..746c8afca718 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -475,7 +475,7 @@ static bool check_region(const struct kvm *kvm, return false; } -static const struct vgic_register_region * +const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 48da1f65a6c3..4505fd4919fd 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -30,6 +30,28 @@ #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) +#define VGIC_AFFINITY_0_SHIFT 0 +#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) +#define VGIC_AFFINITY_1_SHIFT 8 +#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) +#define VGIC_AFFINITY_2_SHIFT 16 +#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) +#define VGIC_AFFINITY_3_SHIFT 24 +#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) + +#define VGIC_AFFINITY_LEVEL(reg, level) \ + ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ + >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/* + * The Userspace encodes the affinity differently from the MPIDR, + * Below macro converts vgic userspace format to MPIDR reg format. + */ +#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ + VGIC_AFFINITY_LEVEL(val, 1) | \ + VGIC_AFFINITY_LEVEL(val, 2) | \ + VGIC_AFFINITY_LEVEL(val, 3)) + static inline bool irq_is_pending(struct vgic_irq *irq) { if (irq->config == VGIC_CONFIG_EDGE) @@ -45,6 +67,18 @@ struct vgic_vmcr { u32 pmr; }; +struct vgic_reg_attr { + struct kvm_vcpu *vcpu; + gpa_t addr; +}; + +int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr); +const struct vgic_register_region * +vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, + gpa_t addr, int len); struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); @@ -97,7 +131,11 @@ bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); - +int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); int kvm_register_vgic_device(unsigned long type); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); -- cgit v1.2.3-70-g09d2 From 4b927b94d5df24381e850dc1a3ecb34b3136d545 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:48 +0530 Subject: KVM: arm/arm64: vgic: Introduce find_reg_by_id() In order to implement vGICv3 CPU interface access, we will need to perform table lookup of system registers. We would need both index_to_params() and find_reg() exported for that purpose, but instead we export a single function which combines them both. Signed-off-by: Pavel Fedin Signed-off-by: Vijaya Kumar K Reviewed-by: Andre Przywara Reviewed-by: Eric Auger Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 27 ++++++++++++++++----------- arch/arm64/kvm/sys_regs.h | 4 ++++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 87e7e6608cd8..caa47ce7e006 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1795,6 +1795,17 @@ static bool index_to_params(u64 id, struct sys_reg_params *params) } } +const struct sys_reg_desc *find_reg_by_id(u64 id, + struct sys_reg_params *params, + const struct sys_reg_desc table[], + unsigned int num) +{ + if (!index_to_params(id, params)) + return NULL; + + return find_reg(params, table, num); +} + /* Decode an index value, and find the sys_reg_desc entry. */ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id) @@ -1807,11 +1818,8 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; - if (!index_to_params(id, ¶ms)) - return NULL; - table = get_target_table(vcpu->arch.target, true, &num); - r = find_reg(¶ms, table, num); + r = find_reg_by_id(id, ¶ms, table, num); if (!r) r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); @@ -1918,10 +1926,8 @@ static int get_invariant_sys_reg(u64 id, void __user *uaddr) struct sys_reg_params params; const struct sys_reg_desc *r; - if (!index_to_params(id, ¶ms)) - return -ENOENT; - - r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); + r = find_reg_by_id(id, ¶ms, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; @@ -1935,9 +1941,8 @@ static int set_invariant_sys_reg(u64 id, void __user *uaddr) int err; u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ - if (!index_to_params(id, ¶ms)) - return -ENOENT; - r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); + r = find_reg_by_id(id, ¶ms, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index dbbb01cfbee9..9c6ffd0f0196 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -136,6 +136,10 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +const struct sys_reg_desc *find_reg_by_id(u64 id, + struct sys_reg_params *params, + const struct sys_reg_desc table[], + unsigned int num); #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x -- cgit v1.2.3-70-g09d2 From 5c34153704898ed7ec0f8c0dceb651cbe4b713fd Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:49 +0530 Subject: irqchip/gic-v3: Add missing system register definitions Define register definitions for ICH_VMCR_EL2, ICC_CTLR_EL1 and ICH_VTR_EL2, ICC_BPR0_EL1, ICC_BPR1_EL1 registers. Signed-off-by: Vijaya Kumar K Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 43 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index e808f8ae6f14..7f6d904a62c6 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -352,8 +352,30 @@ /* * CPU interface registers */ -#define ICC_CTLR_EL1_EOImode_drop_dir (0U << 1) -#define ICC_CTLR_EL1_EOImode_drop (1U << 1) +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) #define ICC_SRE_EL1_SRE (1U << 0) /* @@ -384,12 +406,29 @@ #define ICH_VMCR_CTLR_SHIFT 0 #define ICH_VMCR_CTLR_MASK (0x21f << ICH_VMCR_CTLR_SHIFT) +#define ICH_VMCR_CBPR_SHIFT 4 +#define ICH_VMCR_CBPR_MASK (1 << ICH_VMCR_CBPR_SHIFT) +#define ICH_VMCR_EOIM_SHIFT 9 +#define ICH_VMCR_EOIM_MASK (1 << ICH_VMCR_EOIM_SHIFT) #define ICH_VMCR_BPR1_SHIFT 18 #define ICH_VMCR_BPR1_MASK (7 << ICH_VMCR_BPR1_SHIFT) #define ICH_VMCR_BPR0_SHIFT 21 #define ICH_VMCR_BPR0_MASK (7 << ICH_VMCR_BPR0_SHIFT) #define ICH_VMCR_PMR_SHIFT 24 #define ICH_VMCR_PMR_MASK (0xffUL << ICH_VMCR_PMR_SHIFT) +#define ICH_VMCR_ENG0_SHIFT 0 +#define ICH_VMCR_ENG0_MASK (1 << ICH_VMCR_ENG0_SHIFT) +#define ICH_VMCR_ENG1_SHIFT 1 +#define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT) + +#define ICH_VTR_PRI_BITS_SHIFT 29 +#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT) +#define ICH_VTR_ID_BITS_SHIFT 23 +#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT) +#define ICH_VTR_SEIS_SHIFT 22 +#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) +#define ICH_VTR_A3V_SHIFT 21 +#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) #define ICC_IAR1_EL1_SPURIOUS 0x3ff -- cgit v1.2.3-70-g09d2 From 5fb247d79c04240dce86c842976cde1edde7f7ed Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:50 +0530 Subject: KVM: arm/arm64: vgic: Introduce VENG0 and VENG1 fields to vmcr struct ICC_VMCR_EL2 supports virtual access to ICC_IGRPEN1_EL1.Enable and ICC_IGRPEN0_EL1.Enable fields. Add grpen0 and grpen1 member variables to struct vmcr to support read and write of these fields. Also refactor vgic_set_vmcr and vgic_get_vmcr() code. Drop ICH_VMCR_CTLR_SHIFT and ICH_VMCR_CTLR_MASK macros and instead use ICH_VMCR_EOI* and ICH_VMCR_CBPR* macros. Signed-off-by: Vijaya Kumar K Reviewed-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 -- virt/kvm/arm/vgic/vgic-mmio-v2.c | 16 ---------------- virt/kvm/arm/vgic/vgic-mmio.c | 16 ++++++++++++++++ virt/kvm/arm/vgic/vgic-v3.c | 20 ++++++++++++++++++-- virt/kvm/arm/vgic/vgic.h | 5 +++++ 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 7f6d904a62c6..170e00a40826 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -404,8 +404,6 @@ #define ICH_HCR_EN (1 << 0) #define ICH_HCR_UIE (1 << 1) -#define ICH_VMCR_CTLR_SHIFT 0 -#define ICH_VMCR_CTLR_MASK (0x21f << ICH_VMCR_CTLR_SHIFT) #define ICH_VMCR_CBPR_SHIFT 4 #define ICH_VMCR_CBPR_MASK (1 << ICH_VMCR_CBPR_SHIFT) #define ICH_VMCR_EOIM_SHIFT 9 diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index fa68dd41756e..a3ad7ff95c9b 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -213,22 +213,6 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, } } -static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_vmcr(vcpu, vmcr); - else - vgic_v3_set_vmcr(vcpu, vmcr); -} - -static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_get_vmcr(vcpu, vmcr); - else - vgic_v3_get_vmcr(vcpu, vmcr); -} - #define GICC_ARCH_VERSION_V2 0x2 /* These are for userland accesses only, there is no guest-facing emulation. */ diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 746c8afca718..1d1886e7d640 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -385,6 +385,22 @@ vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions, sizeof(region[0]), match_region); } +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_vmcr(vcpu, vmcr); + else + vgic_v3_set_vmcr(vcpu, vmcr); +} + +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_get_vmcr(vcpu, vmcr); + else + vgic_v3_get_vmcr(vcpu, vmcr); +} + /* * kvm_mmio_read_buf() returns a value in a format where it can be converted * to a byte array and be directly observed as the guest wanted it to appear diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 679ba935698f..42ff9c9826d3 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -175,10 +175,18 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { u32 vmcr; - vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; + /* + * Ignore the FIQen bit, because GIC emulation always implies + * SRE=1 which means the vFIQEn bit is also RES1. + */ + vmcr = ((vmcrp->ctlr >> ICC_CTLR_EL1_EOImode_SHIFT) << + ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; + vmcr |= (vmcrp->ctlr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; + vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; } @@ -187,10 +195,18 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; - vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; + /* + * Ignore the FIQen bit, because GIC emulation always implies + * SRE=1 which means the vFIQEn bit is also RES1. + */ + vmcrp->ctlr = ((vmcr >> ICH_VMCR_EOIM_SHIFT) << + ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; + vmcrp->ctlr |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; + vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; } #define INITIAL_PENDBASER_VALUE \ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 4505fd4919fd..ecfe1a6c841a 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -65,6 +65,9 @@ struct vgic_vmcr { u32 abpr; u32 bpr; u32 pmr; + /* Below member variable are valid only for GICv3 */ + u32 grpen0; + u32 grpen1; }; struct vgic_reg_attr { @@ -137,6 +140,8 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int kvm_register_vgic_device(unsigned long type); +void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); -- cgit v1.2.3-70-g09d2 From d017d7b0bd7ab32644d35666a6c4412daa0b0a1d Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:51 +0530 Subject: KVM: arm/arm64: vgic: Implement VGICv3 CPU interface access VGICv3 CPU interface registers are accessed using KVM_DEV_ARM_VGIC_CPU_SYSREGS ioctl. These registers are accessed as 64-bit. The cpu MPIDR value is passed along with register id. It is used to identify the cpu for registers access. The VM that supports SEIs expect it on destination machine to handle guest aborts and hence checked for ICC_CTLR_EL1.SEIS compatibility. Similarly, VM that supports Affinity Level 3 that is required for AArch64 mode, is required to be supported on destination machine. Hence checked for ICC_CTLR_EL1.A3V compatibility. The arch/arm64/kvm/vgic-sys-reg-v3.c handles read and write of VGIC CPU registers for AArch64. For AArch32 mode, arch/arm/kvm/vgic-v3-coproc.c file is created but APIs are not implemented. Updated arch/arm/include/uapi/asm/kvm.h with new definitions required to compile for AArch32. The version of VGIC v3 specification is defined here Documentation/virtual/kvm/devices/arm-vgic-v3.txt Acked-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Pavel Fedin Signed-off-by: Vijaya Kumar K Signed-off-by: Marc Zyngier --- arch/arm/include/uapi/asm/kvm.h | 3 + arch/arm/kvm/Makefile | 4 +- arch/arm/kvm/vgic-v3-coproc.c | 35 ++++ arch/arm64/include/uapi/asm/kvm.h | 3 + arch/arm64/kvm/Makefile | 3 +- arch/arm64/kvm/vgic-sys-reg-v3.c | 346 ++++++++++++++++++++++++++++++++++++ include/kvm/arm_vgic.h | 8 + virt/kvm/arm/vgic/vgic-kvm-device.c | 27 +++ virt/kvm/arm/vgic/vgic-mmio-v3.c | 6 + virt/kvm/arm/vgic/vgic-v3.c | 8 + virt/kvm/arm/vgic/vgic.h | 25 +++ 11 files changed, 465 insertions(+), 3 deletions(-) create mode 100644 arch/arm/kvm/vgic-v3-coproc.c create mode 100644 arch/arm64/kvm/vgic-sys-reg-v3.c diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 0ae6035e0604..7a3e5370ba68 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -186,9 +186,12 @@ struct kvm_arch_memory_slot { (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) +#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 +#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 + #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* KVM_IRQ_LINE irq field index values */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 12b628187e90..7b3670c2ae7b 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Iarch/arm/kvm +ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic CFLAGS_arm.o := -I. $(plus_virt_def) CFLAGS_mmu.o := -I. @@ -20,7 +20,7 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vf obj-$(CONFIG_KVM_ARM_HOST) += hyp/ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o -obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o +obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o vgic-v3-coproc.o obj-y += $(KVM)/arm/aarch32.o obj-y += $(KVM)/arm/vgic/vgic.o diff --git a/arch/arm/kvm/vgic-v3-coproc.c b/arch/arm/kvm/vgic-v3-coproc.c new file mode 100644 index 000000000000..f41abf76366f --- /dev/null +++ b/arch/arm/kvm/vgic-v3-coproc.c @@ -0,0 +1,35 @@ +/* + * VGIC system registers handling functions for AArch32 mode + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include "vgic.h" + +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg) +{ + /* + * TODO: Implement for AArch32 + */ + return -ENXIO; +} + +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg) +{ + /* + * TODO: Implement for AArch32 + */ + return -ENXIO; +} diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 56dc08db6492..be379d7bc9f1 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -206,9 +206,12 @@ struct kvm_arch_memory_slot { (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) +#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 +#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 + #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* Device Control API on vcpu fd */ diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index e025beceda39..afd51bebb9c5 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -Iarch/arm64/kvm +ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic CFLAGS_arm.o := -I. CFLAGS_mmu.o := -I. @@ -19,6 +19,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o +kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c new file mode 100644 index 000000000000..79f37e37d367 --- /dev/null +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -0,0 +1,346 @@ +/* + * VGIC system registers handling functions for AArch64 mode + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include "vgic.h" +#include "sys_regs.h" + +static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; + u64 val; + + vgic_get_vmcr(vcpu, &vmcr); + if (p->is_write) { + val = p->regval; + + /* + * Disallow restoring VM state if not supported by this + * hardware. + */ + host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> + ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; + if (host_pri_bits > vgic_v3_cpu->num_pri_bits) + return false; + + vgic_v3_cpu->num_pri_bits = host_pri_bits; + + host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> + ICC_CTLR_EL1_ID_BITS_SHIFT; + if (host_id_bits > vgic_v3_cpu->num_id_bits) + return false; + + vgic_v3_cpu->num_id_bits = host_id_bits; + + host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); + seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> + ICC_CTLR_EL1_SEIS_SHIFT; + if (host_seis != seis) + return false; + + host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); + a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; + if (host_a3v != a3v) + return false; + + /* + * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. + * The vgic_set_vmcr() will convert to ICH_VMCR layout. + */ + vmcr.ctlr = val & ICC_CTLR_EL1_CBPR_MASK; + vmcr.ctlr |= val & ICC_CTLR_EL1_EOImode_MASK; + vgic_set_vmcr(vcpu, &vmcr); + } else { + val = 0; + val |= (vgic_v3_cpu->num_pri_bits - 1) << + ICC_CTLR_EL1_PRI_BITS_SHIFT; + val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; + val |= ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << + ICC_CTLR_EL1_SEIS_SHIFT; + val |= ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << + ICC_CTLR_EL1_A3V_SHIFT; + /* + * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. + * Extract it directly using ICC_CTLR_EL1 reg definitions. + */ + val |= vmcr.ctlr & ICC_CTLR_EL1_CBPR_MASK; + val |= vmcr.ctlr & ICC_CTLR_EL1_EOImode_MASK; + + p->regval = val; + } + + return true; +} + +static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (p->is_write) { + vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + } else { + p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; + } + + return true; +} + +static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (p->is_write) { + vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >> + ICC_BPR0_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + } else { + p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & + ICC_BPR0_EL1_MASK; + } + + return true; +} + +static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_vmcr vmcr; + + if (!p->is_write) + p->regval = 0; + + vgic_get_vmcr(vcpu, &vmcr); + if (!((vmcr.ctlr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT)) { + if (p->is_write) { + vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >> + ICC_BPR1_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + } else { + p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & + ICC_BPR1_EL1_MASK; + } + } else { + if (!p->is_write) + p->regval = min((vmcr.bpr + 1), 7U); + } + + return true; +} + +static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (p->is_write) { + vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >> + ICC_IGRPEN0_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + } else { + p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & + ICC_IGRPEN0_EL1_MASK; + } + + return true; +} + +static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + if (p->is_write) { + vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >> + ICC_IGRPEN1_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + } else { + p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & + ICC_IGRPEN1_EL1_MASK; + } + + return true; +} + +static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + uint32_t *ap_reg; + + if (apr) + ap_reg = &vgicv3->vgic_ap1r[idx]; + else + ap_reg = &vgicv3->vgic_ap0r[idx]; + + if (p->is_write) + *ap_reg = p->regval; + else + p->regval = *ap_reg; +} + +static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r, u8 apr) +{ + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + u8 idx = r->Op2 & 3; + + /* + * num_pri_bits are initialized with HW supported values. + * We can rely safely on num_pri_bits even if VM has not + * restored ICC_CTLR_EL1 before restoring APnR registers. + */ + switch (vgic_v3_cpu->num_pri_bits) { + case 7: + vgic_v3_access_apr_reg(vcpu, p, apr, idx); + break; + case 6: + if (idx > 1) + goto err; + vgic_v3_access_apr_reg(vcpu, p, apr, idx); + break; + default: + if (idx > 0) + goto err; + vgic_v3_access_apr_reg(vcpu, p, apr, idx); + } + + return true; +err: + if (!p->is_write) + p->regval = 0; + + return false; +} + +static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) + +{ + return access_gic_aprn(vcpu, p, r, 0); +} + +static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + return access_gic_aprn(vcpu, p, r, 1); +} + +static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Validate SRE bit */ + if (p->is_write) { + if (!(p->regval & ICC_SRE_EL1_SRE)) + return false; + } else { + p->regval = vgicv3->vgic_sre; + } + + return true; +} +static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { + /* ICC_PMR_EL1 */ + { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr }, + /* ICC_BPR0_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 }, + /* ICC_AP0R0_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r }, + /* ICC_AP0R1_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r }, + /* ICC_AP0R2_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r }, + /* ICC_AP0R3_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r }, + /* ICC_AP1R0_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r }, + /* ICC_AP1R1_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r }, + /* ICC_AP1R2_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r }, + /* ICC_AP1R3_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r }, + /* ICC_BPR1_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 }, + /* ICC_CTLR_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr }, + /* ICC_SRE_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre }, + /* ICC_IGRPEN0_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 }, + /* ICC_GRPEN1_EL1 */ + { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 }, +}; + +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg) +{ + struct sys_reg_params params; + u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + + params.regval = *reg; + params.is_write = is_write; + params.is_aarch32 = false; + params.is_32bit = false; + + if (find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs))) + return 0; + + return -ENXIO; +} + +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg) +{ + struct sys_reg_params params; + const struct sys_reg_desc *r; + u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + + if (is_write) + params.regval = *reg; + params.is_write = is_write; + params.is_aarch32 = false; + params.is_32bit = false; + + r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + if (!r) + return -ENXIO; + + if (!r->access(vcpu, ¶ms, r)) + return -EINVAL; + + if (!is_write) + *reg = params.regval; + + return 0; +} diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 0af1477cfe8d..b72dd2ad5f44 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -71,6 +71,8 @@ struct vgic_global { /* GIC system register CPU interface */ struct static_key_false gicv3_cpuif; + + u32 ich_vtr_el2; }; extern struct vgic_global kvm_vgic_global_state; @@ -275,6 +277,12 @@ struct vgic_cpu { u64 pendbaser; bool lpis_enabled; + + /* Cache guest priority bits */ + u32 num_pri_bits; + + /* Cache guest interrupt ID bits */ + u32 num_id_bits; }; extern struct static_key_false vgic_v2_cpuif_trap; diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 227337fd5156..b30372b59dc2 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -504,6 +504,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, if (!is_write) *reg = tmp32; break; + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 regid; + + regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, + regid, reg); + break; + } default: ret = -EINVAL; break; @@ -537,6 +545,15 @@ static int vgic_v3_set_attr(struct kvm_device *dev, reg = tmp32; return vgic_v3_attr_regs_access(dev, attr, ®, true); } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } } return -ENXIO; } @@ -563,6 +580,15 @@ static int vgic_v3_get_attr(struct kvm_device *dev, tmp32 = reg; return put_user(tmp32, uaddr); } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 reg; + + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } } return -ENXIO; @@ -581,6 +607,7 @@ static int vgic_v3_has_attr(struct kvm_device *dev, break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: return 0; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 2031138cfcf4..549ae459ca0e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -645,6 +645,12 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) iodev.base_addr = 0; break; } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { + u64 reg, id; + + id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); + return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); + } default: return -ENXIO; } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 42ff9c9826d3..edc6ee2dc852 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -238,6 +238,13 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_sre = 0; } + vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_ID_BITS_MASK) >> + ICH_VTR_ID_BITS_SHIFT; + vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_PRI_BITS_MASK) >> + ICH_VTR_PRI_BITS_SHIFT) + 1; + /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; } @@ -336,6 +343,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) */ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; kvm_vgic_global_state.can_emulate_gicv2 = false; + kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index ecfe1a6c841a..a5a45f66c058 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -52,6 +52,27 @@ VGIC_AFFINITY_LEVEL(val, 2) | \ VGIC_AFFINITY_LEVEL(val, 3)) +/* + * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt, + * below macros are defined for CPUREG encoding. + */ +#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 +#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 +#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 +#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 +#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 +#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 + +#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ + KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) + static inline bool irq_is_pending(struct vgic_irq *irq) { if (irq->config == VGIC_CONFIG_EDGE) @@ -139,6 +160,10 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u64 id, u64 *val); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, + u64 *reg); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -- cgit v1.2.3-70-g09d2 From e96a006cb0663cca88c45ce125ec5e183f568f80 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:52 +0530 Subject: KVM: arm/arm64: vgic: Implement KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO ioctl Userspace requires to store and restore of line_level for level triggered interrupts using ioctl KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO. Reviewed-by: Eric Auger Signed-off-by: Vijaya Kumar K Signed-off-by: Marc Zyngier --- arch/arm/include/uapi/asm/kvm.h | 6 +++++ arch/arm64/include/uapi/asm/kvm.h | 6 +++++ virt/kvm/arm/vgic/vgic-kvm-device.c | 45 ++++++++++++++++++++++++++++++- virt/kvm/arm/vgic/vgic-mmio-v3.c | 14 ++++++++++ virt/kvm/arm/vgic/vgic-mmio.c | 54 +++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-mmio.h | 5 ++++ virt/kvm/arm/vgic/vgic.h | 2 ++ 7 files changed, 131 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 7a3e5370ba68..6ebd3e6a1fd1 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -191,6 +191,12 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 +#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ + (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff +#define VGIC_LEVEL_INFO_LINE_LEVEL 0 #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index be379d7bc9f1..c2860358ae3e 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -211,6 +211,12 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_GRP_CTRL 4 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 +#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ + (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) +#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff +#define VGIC_LEVEL_INFO_LINE_LEVEL 0 #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index b30372b59dc2..d181d2baee9c 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -512,6 +512,21 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, regid, reg); break; } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + unsigned int info, intid; + + info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; + if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { + intid = attr->attr & + KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; + ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, + intid, reg); + } else { + ret = -EINVAL; + } + break; + } default: ret = -EINVAL; break; @@ -554,6 +569,17 @@ static int vgic_v3_set_attr(struct kvm_device *dev, return vgic_v3_attr_regs_access(dev, attr, ®, true); } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + + if (get_user(tmp32, uaddr)) + return -EFAULT; + + reg = tmp32; + return vgic_v3_attr_regs_access(dev, attr, ®, true); + } } return -ENXIO; } @@ -589,8 +615,18 @@ static int vgic_v3_get_attr(struct kvm_device *dev, return ret; return put_user(reg, uaddr); } - } + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u64 reg; + u32 tmp32; + ret = vgic_v3_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + tmp32 = reg; + return put_user(tmp32, uaddr); + } + } return -ENXIO; } @@ -611,6 +647,13 @@ static int vgic_v3_has_attr(struct kvm_device *dev, return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: return 0; + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { + if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> + KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == + VGIC_LEVEL_INFO_LINE_LEVEL) + return 0; + break; + } case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 549ae459ca0e..6afb3b484886 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -803,3 +803,17 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, else return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); } + +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u64 *val) +{ + if (intid % 32) + return -EINVAL; + + if (is_write) + vgic_write_irq_line_level_info(vcpu, intid, *val); + else + *val = vgic_read_irq_line_level_info(vcpu, intid); + + return 0; +} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 1d1886e7d640..3654b4c835ef 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -362,6 +362,60 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, } } +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +{ + int i; + u64 val = 0; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) + val |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return val; +} + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val) +{ + int i; + int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + + for (i = 0; i < 32; i++) { + struct vgic_irq *irq; + bool new_level; + + if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) + continue; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * Line level is set irrespective of irq type + * (level or edge) to avoid dependency that VM should + * restore irq config before line level. + */ + new_level = !!(val & (1U << i)); + spin_lock(&irq->irq_lock); + irq->line_level = new_level; + if (new_level) + vgic_queue_irq_unlock(vcpu->kvm, irq); + else + spin_unlock(&irq->irq_lock); + + vgic_put_irq(vcpu->kvm, irq); + } +} + static int match_region(const void *key, const void *elt) { const unsigned int offset = (unsigned long)key; diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 7b30296a7306..98bb566b660a 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -177,6 +177,11 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val); +u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); + +void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, + const u64 val); + unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index a5a45f66c058..db28f7cadab2 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -164,6 +164,8 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, u64 *val); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, u64 *reg); +int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, + u32 intid, u64 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -- cgit v1.2.3-70-g09d2 From e363e05e12d76b3311618d442ad545dff4d08728 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Thu, 26 Jan 2017 19:50:53 +0530 Subject: KVM: arm/arm64: Documentation: Update arm-vgic-v3.txt Update error code returned for Invalid CPU interface register value and access in AArch32 mode. Acked-by: Christoffer Dall Reviewed-by: Eric Auger Signed-off-by: Vijaya Kumar K Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt index 9348b3caccd7..c1a24612c198 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt @@ -118,7 +118,7 @@ Groups: -EBUSY: One or more VCPUs are running - KVM_DEV_ARM_VGIC_CPU_SYSREGS + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | @@ -139,13 +139,15 @@ Groups: All system regs accessed through this API are (rw, 64-bit) and kvm_device_attr.addr points to a __u64 value. - KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the CPU specified by the mpidr field. + CPU interface registers access is not implemented for AArch32 mode. + Error -ENXIO is returned when accessed in AArch32 mode. Errors: -ENXIO: Getting or setting this register is not yet supported -EBUSY: VCPU is running - -EINVAL: Invalid mpidr supplied + -EINVAL: Invalid mpidr or register value supplied KVM_DEV_ARM_VGIC_GRP_NR_IRQS @@ -204,3 +206,6 @@ Groups: architecture defined MPIDR, and the field is encoded as follows: | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | | Aff3 | Aff2 | Aff1 | Aff0 | + Errors: + -EINVAL: vINTID is not multiple of 32 or + info field is not VGIC_LEVEL_INFO_LINE_LEVEL -- cgit v1.2.3-70-g09d2 From 8f36ebaf21fdae99c091c67e8b6fab33969f2667 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 25 Jan 2017 12:29:59 +0000 Subject: arm/arm64: KVM: Enforce unconditional flush to PoC when mapping to stage-2 When we fault in a page, we flush it to the PoC (Point of Coherency) if the faulting vcpu has its own caches off, so that it can observe the page we just brought it. But if the vcpu has its caches on, we skip that step. Bad things happen when *another* vcpu tries to access that page with its own caches disabled. At that point, there is no garantee that the data has made it to the PoC, and we access stale data. The obvious fix is to always flush to PoC when a page is faulted in, no matter what the state of the vcpu is. Cc: stable@vger.kernel.org Fixes: 2d58b733c876 ("arm64: KVM: force cache clean on page fault when caches are off") Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 9 +-------- arch/arm64/include/asm/kvm_mmu.h | 3 +-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 74a44727f8e1..a58bbaa3ec60 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -150,18 +150,12 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, * and iterate over the range. */ - bool need_flush = !vcpu_has_cache_enabled(vcpu) || ipa_uncached; - VM_BUG_ON(size & ~PAGE_MASK); - if (!need_flush && !icache_is_pipt()) - goto vipt_cache; - while (size) { void *va = kmap_atomic_pfn(pfn); - if (need_flush) - kvm_flush_dcache_to_poc(va, PAGE_SIZE); + kvm_flush_dcache_to_poc(va, PAGE_SIZE); if (icache_is_pipt()) __cpuc_coherent_user_range((unsigned long)va, @@ -173,7 +167,6 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, kunmap_atomic(va); } -vipt_cache: if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) { /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6f72fe8b0e3e..6d22017ebbad 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -241,8 +241,7 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, { void *va = page_address(pfn_to_page(pfn)); - if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached) - kvm_flush_dcache_to_poc(va, size); + kvm_flush_dcache_to_poc(va, size); if (!icache_is_aliasing()) { /* PIPT */ flush_icache_range((unsigned long)va, -- cgit v1.2.3-70-g09d2 From 13b7756cec3d6e8329fa21c314fe150c12601c6c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 25 Jan 2017 13:33:11 +0000 Subject: arm/arm64: KVM: Stop propagating cacheability status of a faulted page Now that we unconditionally flush newly mapped pages to the PoC, there is no need to care about the "uncached" status of individual pages - they must all be visible all the way down. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 3 +-- arch/arm/kvm/mmu.c | 11 ++++------- arch/arm64/include/asm/kvm_mmu.h | 3 +-- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index a58bbaa3ec60..95f38dcd611d 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -129,8 +129,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size, - bool ipa_uncached) + unsigned long size) { /* * If we are going to insert an instruction page and the icache is diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index a5265edbeeab..5cc35080cbf7 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1232,9 +1232,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, } static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size, bool uncached) + unsigned long size) { - __coherent_cache_guest_page(vcpu, pfn, size, uncached); + __coherent_cache_guest_page(vcpu, pfn, size); } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1250,7 +1250,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct vm_area_struct *vma; kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; - bool fault_ipa_uncached; bool logging_active = memslot_is_logging(memslot); unsigned long flags = 0; @@ -1337,8 +1336,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (!hugetlb && !force_pte) hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); - fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT; - if (hugetlb) { pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); @@ -1346,7 +1343,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); + coherent_cache_guest_page(vcpu, pfn, PMD_SIZE); ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); @@ -1356,7 +1353,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); + coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE); ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6d22017ebbad..aa1e6db15a2d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -236,8 +236,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size, - bool ipa_uncached) + unsigned long size) { void *va = page_address(pfn_to_page(pfn)); -- cgit v1.2.3-70-g09d2 From 9d93dc1c96ec446bef9c34a189ea24556f1af89a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 25 Jan 2017 13:47:43 +0000 Subject: arm/arm64: KVM: Get rid of KVM_MEMSLOT_INCOHERENT KVM_MEMSLOT_INCOHERENT is not used anymore, as we've killed its only use in the arm/arm64 MMU code. Let's remove the last artifacts. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/mmu.c | 9 --------- include/linux/kvm_host.h | 1 - 2 files changed, 10 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 5cc35080cbf7..962616fd4ddd 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1876,15 +1876,6 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { - /* - * Readonly memslots are not incoherent with the caches by definition, - * but in practice, they are used mostly to emulate ROMs or NOR flashes - * that the guest may consider devices and hence map as uncached. - * To prevent incoherency issues in these cases, tag all readonly - * regions as incoherent. - */ - if (slot->flags & KVM_MEM_READONLY) - slot->flags |= KVM_MEMSLOT_INCOHERENT; return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c5190dab2c1..cda457bcedc1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -45,7 +45,6 @@ * include/linux/kvm_h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) -#define KVM_MEMSLOT_INCOHERENT (1UL << 17) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 -- cgit v1.2.3-70-g09d2 From ccc4df4e2c3825919456c13b153d2a67bbf328dc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:57 +1100 Subject: Documentation: Correct duplicate section number in kvm/api.txt Both KVM_CREATE_SPAPR_TCE_64 and KVM_REINJECT_CONTROL have section number 4.98 in Documentation/virtual/kvm/api.txt, presumably due to a naive merge. This corrects the duplication. [paulus@ozlabs.org - correct section numbers for following sections, KVM_PPC_CONFIGURE_V3_MMU and KVM_PPC_GET_RMMU_INFO, as well.] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4470671b0c26..aca994a90355 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3177,7 +3177,7 @@ of IOMMU pages. The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. -4.98 KVM_REINJECT_CONTROL +4.99 KVM_REINJECT_CONTROL Capability: KVM_CAP_REINJECT_CONTROL Architectures: x86 @@ -3201,7 +3201,7 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). -4.99 KVM_PPC_CONFIGURE_V3_MMU +4.100 KVM_PPC_CONFIGURE_V3_MMU Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 Architectures: ppc @@ -3232,7 +3232,7 @@ process table, which is in the guest's space. This field is formatted as the second doubleword of the partition table entry, as defined in the Power ISA V3.00, Book III section 5.7.6.1. -4.100 KVM_PPC_GET_RMMU_INFO +4.101 KVM_PPC_GET_RMMU_INFO Capability: KVM_CAP_PPC_RADIX_MMU Architectures: ppc -- cgit v1.2.3-70-g09d2 From ef1ead0c3b1dfb43d33caa4f50c8d214f86b6bc8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:58 +1100 Subject: KVM: PPC: Book3S HV: HPT resizing documentation and reserved numbers This adds a new powerpc-specific KVM_CAP_SPAPR_RESIZE_HPT capability to advertise whether KVM is capable of handling the PAPR extensions for resizing the hashed page table during guest runtime. It also adds definitions for two new VM ioctl()s to implement this extension, and documentation of the same. Note that, HPT resizing is already possible with KVM PR without kernel modification, since the HPT is managed within userspace (qemu). The capability defined here will only be set where an in-kernel implementation of resizing is necessary, i.e. for KVM HV. To determine if the userspace resize implementation can be used, it's necessary to check KVM_CAP_PPC_ALLOC_HTAB. Unfortunately older kernels incorrectly set KVM_CAP_PPC_ALLOC_HTAB even with KVM PR. If userspace it want to support resizing with KVM PR on such kernels, it will need a workaround. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 +++++ 2 files changed, 106 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index aca994a90355..64f217af0416 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3266,6 +3266,101 @@ The ap_encodings gives the supported page sizes and their AP field encodings, encoded with the AP value in the top 3 bits and the log base 2 of the page size in the bottom 6 bits. +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO is there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On succsful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5. The kvm_run structure ------------------------ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e0035808c814..7964b970b9ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info { struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -871,6 +878,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 @@ -1189,6 +1197,9 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_SPAPR_RESIZE_HPT */ +#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) +#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) /* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ -- cgit v1.2.3-70-g09d2 From db9a290d9c3c596e5325e2a42133594435e5de46 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:59 +1100 Subject: KVM: PPC: Book3S HV: Rename kvm_alloc_hpt() for clarity The difference between kvm_alloc_hpt() and kvmppc_alloc_hpt() is not at all obvious from the name. In practice kvmppc_alloc_hpt() allocates an HPT by whatever means, and calls kvm_alloc_hpt() which will attempt to allocate it with CMA only. To make this less confusing, rename kvm_alloc_hpt() to kvm_alloc_hpt_cma(). Similarly, kvm_release_hpt() is renamed kvm_free_hpt_cma(). Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++++---- arch/powerpc/kvm/book3s_hv_builtin.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 48c760f89590..6fad1f12e9ec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -186,8 +186,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); -extern struct page *kvm_alloc_hpt(unsigned long nr_pages); -extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); +extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); +extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern void kvmppc_core_free_memslot(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9df3d940acec..16f278417c69 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -62,7 +62,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT)); + page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); @@ -108,7 +108,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) out_freehpt: if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); return -ENOMEM; @@ -157,8 +157,8 @@ void kvmppc_free_hpt(struct kvm *kvm) { vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fe08fea54b70..96e7e609f621 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -57,19 +57,19 @@ static int __init early_parse_kvm_cma_resv(char *p) } early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); -struct page *kvm_alloc_hpt(unsigned long nr_pages) +struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); } -EXPORT_SYMBOL_GPL(kvm_alloc_hpt); +EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); -void kvm_release_hpt(struct page *page, unsigned long nr_pages) +void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages) { cma_release(kvm_cma, page, nr_pages); } -EXPORT_SYMBOL_GPL(kvm_release_hpt); +EXPORT_SYMBOL_GPL(kvm_free_hpt_cma); /** * kvm_cma_reserve() - reserve area for kvm hash pagetable -- cgit v1.2.3-70-g09d2 From 3f9d4f5a5f35e402e91bedf0c15e29cef187a29d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:00 +1100 Subject: KVM: PPC: Book3S HV: Gather HPT related variables into sub-structure Currently, the powerpc kvm_arch structure contains a number of variables tracking the state of the guest's hashed page table (HPT) in KVM HV. This patch gathers them all together into a single kvm_hpt_info substructure. This makes life more convenient for the upcoming HPT resizing implementation. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 20 +++++--- arch/powerpc/kvm/book3s_64_mmu_hv.c | 92 ++++++++++++++++++------------------- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 ++++++++++++------------- 4 files changed, 92 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b2dbeac3f450..ea6f0c659936 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -241,12 +241,24 @@ struct kvm_arch_memory_slot { #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; +struct kvm_hpt_info { + /* Host virtual (linear mapping) address of guest HPT */ + unsigned long virt; + /* Array of reverse mapping entries for each guest HPTE */ + struct revmap_entry *rev; + unsigned long npte; + unsigned long mask; + /* Guest HPT size is 2**(order) bytes */ + u32 order; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; +}; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; - unsigned long hpt_virt; - struct revmap_entry *revmap; + struct kvm_hpt_info hpt; atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; @@ -256,15 +268,11 @@ struct kvm_arch { unsigned long lpcr; unsigned long vrma_slb_v; int hpte_setup_done; - u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; - unsigned long hpt_npte; - unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; - int hpt_cma_alloc; u8 radix; pgd_t *pgtable; u64 process_table; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 16f278417c69..2af63ce129bc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -61,12 +61,12 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) order = PPC_MIN_HPT_ORDER; } - kvm->arch.hpt_cma_alloc = 0; + kvm->arch.hpt.cma = 0; page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt_cma_alloc = 1; + kvm->arch.hpt.cma = 1; } /* Lastly try successively smaller sizes from the page allocator */ @@ -81,22 +81,22 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) if (!hpt) return -ENOMEM; - kvm->arch.hpt_virt = hpt; - kvm->arch.hpt_order = order; + kvm->arch.hpt.virt = hpt; + kvm->arch.hpt.order = order; /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt_npte = 1ul << (order - 4); + kvm->arch.hpt.npte = 1ul << (order - 4); /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; } - kvm->arch.revmap = rev; + kvm->arch.hpt.rev = rev; kvm->arch.sdr1 = __pa(hpt) | (order - 18); pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", @@ -107,7 +107,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) return 0; out_freehpt: - if (kvm->arch.hpt_cma_alloc) + if (kvm->arch.hpt.cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); @@ -132,10 +132,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt_virt) { - order = kvm->arch.hpt_order; + if (kvm->arch.hpt.virt) { + order = kvm->arch.hpt.order; /* Set the entire HPT to 0, i.e. invalid HPTEs */ - memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* * Reset all the reverse-mapping chains for all memslots */ @@ -155,13 +155,13 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - vfree(kvm->arch.revmap); - if (kvm->arch.hpt_cma_alloc) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else if (kvm->arch.hpt_virt) - free_pages(kvm->arch.hpt_virt, - kvm->arch.hpt_order - PAGE_SHIFT); + vfree(kvm->arch.hpt.rev); + if (kvm->arch.hpt.cma) + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), + 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); + else if (kvm->arch.hpt.virt) + free_pages(kvm->arch.hpt.virt, + kvm->arch.hpt.order - PAGE_SHIFT); } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ @@ -196,8 +196,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt_mask + 1) - npages = kvm->arch.hpt_mask + 1; + if (npages > kvm->arch.hpt.mask + 1) + npages = kvm->arch.hpt.mask + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +207,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create @@ -340,11 +340,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; if (cpu_has_feature(CPU_FTR_ARCH_300)) v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); - gr = kvm->arch.revmap[index].guest_rpte; + gr = kvm->arch.hpt.rev[index].guest_rpte; unlock_hpte(hptep, orig_v); preempt_enable(); @@ -485,8 +485,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } index = vcpu->arch.pgfault_index; - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - rev = &kvm->arch.revmap[index]; + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); + rev = &kvm->arch.hpt.rev[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); @@ -748,7 +748,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; @@ -768,7 +768,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); @@ -860,7 +860,7 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; __be64 *hptep; int ret = 0; @@ -880,7 +880,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* If this HPTE isn't referenced, ignore it */ @@ -923,7 +923,7 @@ int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; int ret = 1; @@ -940,7 +940,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, if (*rmapp & KVMPPC_RMAP_PRESENT) { i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; @@ -980,7 +980,7 @@ static int vcpus_running(struct kvm *kvm) */ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long n; unsigned long v, r; @@ -1005,7 +1005,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) i = head = *rmapp & KVMPPC_RMAP_INDEX; do { unsigned long hptep1; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* @@ -1311,8 +1311,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, flags = ctx->flags; i = ctx->index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - revp = kvm->arch.revmap + i; + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + revp = kvm->arch.hpt.rev + i; lbuf = (unsigned long __user *)buf; nb = 0; @@ -1327,7 +1327,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1337,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1353,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1374,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt_npte) { + if (i >= kvm->arch.hpt.npte) { i = 0; ctx->first_pass = 0; break; @@ -1433,11 +1433,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt_npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + if (i >= kvm->arch.hpt.npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt.npte) break; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { __be64 hpte_v; @@ -1624,8 +1624,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; @@ -1635,7 +1635,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, cpu_relax(); v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; hr = be64_to_cpu(hptp[1]); - gr = kvm->arch.revmap[i].guest_rpte; + gr = kvm->arch.hpt.rev[i].guest_rpte; unlock_hpte(hptp, v); preempt_enable(); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bdf281cc88c0..02607128a4d4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3197,7 +3197,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ - if (!kvm->arch.hpt_virt) { + if (!kvm->arch.hpt.virt) { err = kvmppc_alloc_hpt(kvm, NULL); if (err) { pr_err("KVM: Couldn't alloc HPT\n"); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index b095afcd4309..175748acc9a1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -86,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, if (*rmap & KVMPPC_RMAP_PRESENT) { i = *rmap & KVMPPC_RMAP_INDEX; - head = &kvm->arch.revmap[i]; + head = &kvm->arch.hpt.rev[i]; if (realmode) head = real_vmalloc_addr(head); - tail = &kvm->arch.revmap[head->back]; + tail = &kvm->arch.hpt.rev[head->back]; if (realmode) tail = real_vmalloc_addr(tail); rev->forw = i; @@ -154,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; - next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); - prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]); next->back = rev->back; prev->forw = rev->forw; if (head == pte_index) { @@ -292,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | @@ -327,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte_index += i; } else { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) { /* Lock the slot and check again */ @@ -344,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } /* Save away the guest's idea of the second HPTE dword */ - rev = &kvm->arch.revmap[pte_index]; + rev = &kvm->arch.hpt.rev[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); if (rev) { @@ -469,9 +469,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = orig_pte = be64_to_cpu(hpte[0]); @@ -487,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, return H_NOT_FOUND; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); @@ -557,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt_npte) { + pte_index >= kvm->arch.hpt.npte) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; break; } - hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); + hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4)); /* to avoid deadlock, don't spin except for first */ if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { if (n) @@ -600,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } args[j] = ((0x80 | flags) << 56) + pte_index; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); note_hpte_modification(kvm, rev); if (!(hp0 & HPTE_V_VALID)) { @@ -657,10 +657,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = pte_v = be64_to_cpu(hpte[0]); @@ -680,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, /* Update guest view of 2nd HPTE dword */ mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); if (rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; @@ -728,15 +728,15 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { @@ -769,11 +769,11 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -817,11 +817,11 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -981,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, val |= avpn; for (;;) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7)); for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt_mask; + hash = hash ^ kvm->arch.hpt.mask; } return -1; } @@ -1066,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, return status; /* there really was no HPTE */ return 0; /* for prot fault, HPTE disappeared */ } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { v = hpte_new_to_old_v(v, r); r = hpte_new_to_old_r(r); } - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]); gr = rev->guest_rpte; unlock_hpte(hpte, orig_v); -- cgit v1.2.3-70-g09d2 From 3d089f84c6f9b7b0eda993142d73961a44b553d2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:01 +1100 Subject: KVM: PPC: Book3S HV: Don't store values derivable from HPT order Currently the kvm_hpt_info structure stores the hashed page table's order, and also the number of HPTEs it contains and a mask for its size. The last two can be easily derived from the order, so remove them and just calculate them as necessary with a couple of helper inlines. Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++++++++++++ arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 +++++++++++++--------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 +++++++++--------- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0db010cc4e65..ecc3b33bcc59 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -356,6 +356,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); +static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt) +{ + /* HPTEs are 2**4 bytes long */ + return 1UL << (hpt->order - 4); +} + +static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) +{ + /* 128 (2**7) bytes in each HPTEG */ + return (1UL << (hpt->order - 7)) - 1; +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ea6f0c659936..0aa0f22d775a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -246,8 +246,6 @@ struct kvm_hpt_info { unsigned long virt; /* Array of reverse mapping entries for each guest HPTE */ struct revmap_entry *rev; - unsigned long npte; - unsigned long mask; /* Guest HPT size is 2**(order) bytes */ u32 order; /* 1 if HPT allocated with CMA, 0 otherwise */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2af63ce129bc..d89995ef6c7b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -83,15 +83,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) kvm->arch.hpt.virt = hpt; kvm->arch.hpt.order = order; - /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt.npte = 1ul << (order - 4); - /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; @@ -196,8 +192,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt.mask + 1) - npages = kvm->arch.hpt.mask + 1; + if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) + npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +203,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) + & kvmppc_hpt_mask(&kvm->arch.hpt); /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create @@ -1327,7 +1324,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1334,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1350,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1371,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt.npte) { + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { i = 0; ctx->first_pass = 0; break; @@ -1433,8 +1430,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt.npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt.npte) + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || + i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) break; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); @@ -1625,7 +1622,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { + for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); + ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 175748acc9a1..6fca970373ee 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -292,7 +292,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; @@ -469,7 +469,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -557,7 +557,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt.npte) { + pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; @@ -657,7 +657,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); @@ -728,7 +728,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; @@ -769,7 +769,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -817,7 +817,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt); avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt.mask; + hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt); } return -1; } -- cgit v1.2.3-70-g09d2 From aae0777f1e8224b4fbb78b2c692060852ee750c8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:02 +1100 Subject: KVM: PPC: Book3S HV: Split HPT allocation from activation Currently, kvmppc_alloc_hpt() both allocates a new hashed page table (HPT) and sets it up as the active page table for a VM. For the upcoming HPT resize implementation we're going to want to allocate HPTs separately from activating them. So, split the allocation itself out into kvmppc_allocate_hpt() and perform the activation with a new kvmppc_set_hpt() function. Likewise we split kvmppc_free_hpt(), which just frees the HPT, from kvmppc_release_hpt() which unsets it as an active HPT, then frees it. We also move the logic to fall back to smaller HPT sizes if the first try fails into the single caller which used that behaviour, kvmppc_hv_setup_htab_rma(). This introduces a slight semantic change, in that previously if the initial attempt at CMA allocation failed, we would fall back to attempting smaller sizes with the page allocator. Now, we try first CMA, then the page allocator at each size. As far as I can tell this change should be harmless. To match, we make kvmppc_free_hpt() just free the actual HPT itself. The call to kvmppc_free_lpid() that was there, we move to the single caller. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 4 ++ arch/powerpc/include/asm/kvm_ppc.h | 5 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 97 ++++++++++++++++---------------- arch/powerpc/kvm/book3s_hv.c | 17 +++++- 4 files changed, 68 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index ecc3b33bcc59..d9b48f5bb606 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -22,6 +22,10 @@ #include +/* Power architecture requires HPT is at least 256kiB, at most 64TiB */ +#define PPC_MIN_HPT_ORDER 18 +#define PPC_MAX_HPT_ORDER 46 + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6fad1f12e9ec..ba61dec72089 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); +extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); +extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); -extern void kvmppc_free_hpt(struct kvm *kvm); +extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d89995ef6c7b..62d132a3cec5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,74 +40,66 @@ #include "trace_hv.h" -/* Power architecture requires HPT is at least 256kB */ -#define PPC_MIN_HPT_ORDER 18 - static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); static void kvmppc_rmap_reset(struct kvm *kvm); -long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; - struct revmap_entry *rev; + int cma = 0; struct page *page = NULL; - long order = KVM_DEFAULT_HPT_ORDER; + struct revmap_entry *rev; + unsigned long npte; - if (htab_orderp) { - order = *htab_orderp; - if (order < PPC_MIN_HPT_ORDER) - order = PPC_MIN_HPT_ORDER; - } + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; - kvm->arch.hpt.cma = 0; page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt.cma = 1; + cma = 1; } - /* Lastly try successively smaller sizes from the page allocator */ - /* Only do this if userspace didn't specify a size via ioctl */ - while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, order - PAGE_SHIFT); - if (!hpt) - --order; - } + if (!hpt) + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) return -ENOMEM; - kvm->arch.hpt.virt = hpt; - kvm->arch.hpt.order = order; - - atomic64_set(&kvm->arch.mmio_update, 0); + /* HPTEs are 2**4 bytes long */ + npte = 1ul << (order - 4); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); + rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); - goto out_freehpt; + pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); + if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; } - kvm->arch.hpt.rev = rev; - kvm->arch.sdr1 = __pa(hpt) | (order - 18); - pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", - hpt, order, kvm->arch.lpid); + info->order = order; + info->virt = hpt; + info->cma = cma; + info->rev = rev; - if (htab_orderp) - *htab_orderp = order; return 0; +} - out_freehpt: - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); - else - free_pages(hpt, order - PAGE_SHIFT); - return -ENOMEM; +void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) +{ + atomic64_set(&kvm->arch.mmio_update, 0); + kvm->arch.hpt = *info; + kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); + + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + info->virt, (long)info->order, kvm->arch.lpid); } long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) @@ -141,23 +133,28 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) *htab_orderp = order; err = 0; } else { - err = kvmppc_alloc_hpt(kvm, htab_orderp); - order = *htab_orderp; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, *htab_orderp); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); } out: mutex_unlock(&kvm->lock); return err; } -void kvmppc_free_hpt(struct kvm *kvm) +void kvmppc_free_hpt(struct kvm_hpt_info *info) { - vfree(kvm->arch.hpt.rev); - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), - 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); - else if (kvm->arch.hpt.virt) - free_pages(kvm->arch.hpt.virt, - kvm->arch.hpt.order - PAGE_SHIFT); + vfree(info->rev); + if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) + free_pages(info->virt, info->order - PAGE_SHIFT); + info->virt = 0; + info->order = 0; } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 02607128a4d4..19987e4343c3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3198,11 +3198,22 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { - err = kvmppc_alloc_hpt(kvm, NULL); - if (err) { + int order = KVM_DEFAULT_HPT_ORDER; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, order); + /* If we get here, it means userspace didn't specify a + * size explicitly. So, try successively smaller + * sizes if the default failed. */ + while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) + err = kvmppc_allocate_hpt(&info, order); + + if (err < 0) { pr_err("KVM: Couldn't alloc HPT\n"); goto out; } + + kvmppc_set_hpt(kvm, &info); } /* Look up the memslot for guest physical address 0 */ @@ -3467,7 +3478,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else - kvmppc_free_hpt(kvm); + kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_free_pimap(kvm); } -- cgit v1.2.3-70-g09d2 From f98a8bf9ee201b7e22fc05e27150b1e481d4949f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:03 +1100 Subject: KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size The KVM_PPC_ALLOCATE_HTAB ioctl() is used to set the size of hashed page table (HPT) that userspace expects a guest VM to have, and is also used to clear that HPT when necessary (e.g. guest reboot). At present, once the ioctl() is called for the first time, the HPT size can never be changed thereafter - it will be cleared but always sized as from the first call. With upcoming HPT resize implementation, we're going to need to allow userspace to resize the HPT at reset (to change it back to the default size if the guest changed it). So, we need to allow this ioctl() to change the HPT size. This patch also updates Documentation/virtual/kvm/api.txt to reflect the new behaviour. In fact the documentation was already slightly incorrect since 572abd5 "KVM: PPC: Book3S HV: Don't fall back to smaller HPT size in allocation ioctl" Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 14 ++++++++------ arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 29 ++++++++++++++++------------- arch/powerpc/kvm/book3s_hv.c | 5 +---- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 64f217af0416..f1945d8cbccb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2443,18 +2443,20 @@ are, it will do nothing and return an EBUSY error. The parameter is a pointer to a 32-bit unsigned integer variable containing the order (log base 2) of the desired size of the hash table, which must be between 18 and 46. On successful return from the -ioctl, it will have been updated with the order of the hash table that -was allocated. +ioctl, the value will not be changed by the kernel. If no hash table has been allocated when any vcpu is asked to run (with the KVM_RUN ioctl), the host kernel will allocate a default-sized hash table (16 MB). If this ioctl is called when a hash table has already been allocated, -the kernel will clear out the existing hash table (zero all HPTEs) and -return the hash table order in the parameter. (If the guest is using -the virtualized real-mode area (VRMA) facility, the kernel will -re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this is ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VMRA +HPTEs on the next KVM_RUN of any vcpu. 4.77 KVM_S390_INTERRUPT diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ba61dec72089..cf3ef8d75910 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -157,7 +157,7 @@ extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); -extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 62d132a3cec5..3a607faf0f9f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -102,10 +102,10 @@ void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) info->virt, (long)info->order, kvm->arch.lpid); } -long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) { long err = -EBUSY; - long order; + struct kvm_hpt_info info; if (kvm_is_radix(kvm)) return -EINVAL; @@ -120,8 +120,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt.virt) { - order = kvm->arch.hpt.order; + if (kvm->arch.hpt.order == order) { + /* We already have a suitable HPT */ + /* Set the entire HPT to 0, i.e. invalid HPTEs */ memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* @@ -130,17 +131,19 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) kvmppc_rmap_reset(kvm); /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - *htab_orderp = order; err = 0; - } else { - struct kvm_hpt_info info; - - err = kvmppc_allocate_hpt(&info, *htab_orderp); - if (err < 0) - goto out; - kvmppc_set_hpt(kvm, &info); + goto out; } - out: + + if (kvm->arch.hpt.virt) + kvmppc_free_hpt(&kvm->arch.hpt); + + err = kvmppc_allocate_hpt(&info, order); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); + +out: mutex_unlock(&kvm->lock); return err; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 19987e4343c3..fbc901746304 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3704,12 +3704,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + r = kvmppc_alloc_reset_hpt(kvm, htab_order); if (r) break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; r = 0; break; } -- cgit v1.2.3-70-g09d2 From 639e459768845924705933db9142baef545ff5fc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:04 +1100 Subject: KVM: PPC: Book3S HV: Create kvmppc_unmap_hpte_helper() The kvm_unmap_rmapp() function, called from certain MMU notifiers, is used to force all guest mappings of a particular host page to be set ABSENT, and removed from the reverse mappings. For HPT resizing, we will have some cases where we want to set just a single guest HPTE ABSENT and remove its reverse mappings. To prepare with this, we split out the logic from kvm_unmap_rmapp() to evict a single HPTE, moving it to a new helper function. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 77 +++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 3a607faf0f9f..6d70989686a7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -742,13 +742,53 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } +/* Must be called with both HPTE and rmap locked */ +static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + unsigned long *rmapp, unsigned long gfn) +{ + __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); + struct revmap_entry *rev = kvm->arch.hpt.rev; + unsigned long j, h; + unsigned long ptel, psize, rcbits; + + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + /* Harvest R and C */ + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmapp, psize); + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } +} + static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.hpt.rev; - unsigned long h, i, j; + unsigned long i; __be64 *hptep; - unsigned long ptel, psize, rcbits; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; @@ -773,37 +813,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, cpu_relax(); continue; } - j = rev[i].forw; - if (j == i) { - /* chain is now empty */ - *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); - } else { - /* remove i from chain */ - h = rev[i].back; - rev[h].forw = j; - rev[j].back = h; - rev[i].forw = rev[i].back = i; - *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; - } - /* Now check and modify the HPTE */ - ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); - if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && - hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); - /* Harvest R and C */ - rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); - *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); - if (rcbits & ~rev[i].guest_rpte) { - rev[i].guest_rpte = ptel | rcbits; - note_hpte_modification(kvm, &rev[i]); - } - } + kvmppc_unmap_hpte(kvm, i, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } -- cgit v1.2.3-70-g09d2 From 5e9859699aba74c0e297645e7d1734cd4b964de7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:05 +1100 Subject: KVM: PPC: Book3S HV: Outline of KVM-HV HPT resizing implementation This adds a not yet working outline of the HPT resizing PAPR extension. Specifically it adds the necessary ioctl() functions, their basic steps, the work function which will handle preparation for the resize, and synchronization between these, the guest page fault path and guest HPT update path. The actual guts of the implementation isn't here yet, so for now the calls will always fail. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 191 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 25 +++++ 4 files changed, 223 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0aa0f22d775a..7bba8f415627 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -252,6 +252,8 @@ struct kvm_hpt_info { int cma; }; +struct kvm_resize_hpt; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -276,6 +278,7 @@ struct kvm_arch { u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index cf3ef8d75910..dd11c4c8c56a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -215,6 +215,10 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); +extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); +extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6d70989686a7..323287fc0c01 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,9 +40,34 @@ #include "trace_hv.h" +//#define DEBUG_RESIZE_HPT 1 + +#ifdef DEBUG_RESIZE_HPT +#define resize_hpt_debug(resize, ...) \ + do { \ + printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ + printk(__VA_ARGS__); \ + } while (0) +#else +#define resize_hpt_debug(resize, ...) \ + do { } while (0) +#endif + static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); + +struct kvm_resize_hpt { + /* These fields read-only after init */ + struct kvm *kvm; + struct work_struct work; + u32 order; + + /* These fields protected by kvm->lock */ + int error; + bool prepare_done; +}; + static void kvmppc_rmap_reset(struct kvm *kvm); int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) @@ -1179,6 +1204,172 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_read_unlock(&kvm->srcu, srcu_idx); } +/* + * HPT resizing + */ +static int resize_hpt_allocate(struct kvm_resize_hpt *resize) +{ + return 0; +} + +static int resize_hpt_rehash(struct kvm_resize_hpt *resize) +{ + return -EIO; +} + +static void resize_hpt_pivot(struct kvm_resize_hpt *resize) +{ +} + +static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) +{ + BUG_ON(kvm->arch.resize_hpt != resize); + kvm->arch.resize_hpt = NULL; + kfree(resize); +} + +static void resize_hpt_prepare_work(struct work_struct *work) +{ + struct kvm_resize_hpt *resize = container_of(work, + struct kvm_resize_hpt, + work); + struct kvm *kvm = resize->kvm; + int err; + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + mutex_lock(&kvm->lock); + + resize->error = err; + resize->prepare_done = true; + + mutex_unlock(&kvm->lock); +} + +long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + int ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + if (resize) { + if (resize->order == shift) { + /* Suitable resize in progress */ + if (resize->prepare_done) { + ret = resize->error; + if (ret != 0) + resize_hpt_release(kvm, resize); + } else { + ret = 100; /* estimated time in ms */ + } + + goto out; + } + + /* not suitable, cancel it */ + resize_hpt_release(kvm, resize); + } + + ret = 0; + if (!shift) + goto out; /* nothing to do */ + + /* start new resize */ + + resize = kzalloc(sizeof(*resize), GFP_KERNEL); + resize->order = shift; + resize->kvm = kvm; + INIT_WORK(&resize->work, resize_hpt_prepare_work); + kvm->arch.resize_hpt = resize; + + schedule_work(&resize->work); + + ret = 100; /* estimated time in ms */ + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static void resize_hpt_boot_vcpu(void *opaque) +{ + /* Nothing to do, just force a KVM exit */ +} + +long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + long ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + /* This shouldn't be possible */ + ret = -EIO; + if (WARN_ON(!kvm->arch.hpte_setup_done)) + goto out_no_hpt; + + /* Stop VCPUs from running while we mess with the HPT */ + kvm->arch.hpte_setup_done = 0; + smp_mb(); + + /* Boot all CPUs out of the guest so they re-read + * hpte_setup_done */ + on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); + + ret = -ENXIO; + if (!resize || (resize->order != shift)) + goto out; + + ret = -EBUSY; + if (!resize->prepare_done) + goto out; + + ret = resize->error; + if (ret != 0) + goto out; + + ret = resize_hpt_rehash(resize); + if (ret != 0) + goto out; + + resize_hpt_pivot(resize); + +out: + /* Let VCPUs run again */ + kvm->arch.hpte_setup_done = 1; + smp_mb(); +out_no_hpt: + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); + return ret; +} + /* * Functions for reading and writing the hash table via reads and * writes on a file descriptor. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fbc901746304..1e107ece4e37 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3422,6 +3422,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.lpcr = lpcr; + /* Initialization for future HPT resizes */ + kvm->arch.resize_hpt = NULL; + /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. @@ -3721,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, break; } + case KVM_PPC_RESIZE_HPT_PREPARE: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); + break; + } + + case KVM_PPC_RESIZE_HPT_COMMIT: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); + break; + } + default: r = -ENOTTY; } -- cgit v1.2.3-70-g09d2 From b5baa68773150772c275b4af1bb31327200cfc05 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:06 +1100 Subject: KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation This adds the "guts" of the implementation for the HPT resizing PAPR extension. It has the code to allocate and clear a new HPT, rehash an existing HPT's entries into it, and accomplish the switchover for a KVM guest from the old HPT to the new one. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 188 +++++++++++++++++++++++++++++++++++- 1 file changed, 187 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 323287fc0c01..013552f05182 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -66,6 +66,10 @@ struct kvm_resize_hpt { /* These fields protected by kvm->lock */ int error; bool prepare_done; + + /* Private to the work thread, until prepare_done is true, + * then protected by kvm->resize_hpt_sem */ + struct kvm_hpt_info hpt; }; static void kvmppc_rmap_reset(struct kvm *kvm); @@ -1209,21 +1213,203 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, */ static int resize_hpt_allocate(struct kvm_resize_hpt *resize) { + int rc; + + rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); + if (rc < 0) + return rc; + + resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", + resize->hpt.virt); + return 0; } +static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, + unsigned long idx) +{ + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info *old = &kvm->arch.hpt; + struct kvm_hpt_info *new = &resize->hpt; + unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; + unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; + __be64 *hptep, *new_hptep; + unsigned long vpte, rpte, guest_rpte; + int ret; + struct revmap_entry *rev; + unsigned long apsize, psize, avpn, pteg, hash; + unsigned long new_idx, new_pteg, replace_vpte; + + hptep = (__be64 *)(old->virt + (idx << 4)); + + /* Guest is stopped, so new HPTEs can't be added or faulted + * in, only unmapped or altered by host actions. So, it's + * safe to check this before we take the HPTE lock */ + vpte = be64_to_cpu(hptep[0]); + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + return 0; /* nothing to do */ + + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + + vpte = be64_to_cpu(hptep[0]); + + ret = 0; + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + /* Nothing to do */ + goto out; + + /* Unmap */ + rev = &old->rev[idx]; + guest_rpte = rev->guest_rpte; + + ret = -EIO; + apsize = hpte_page_size(vpte, guest_rpte); + if (!apsize) + goto out; + + if (vpte & HPTE_V_VALID) { + unsigned long gfn = hpte_rpn(guest_rpte, apsize); + int srcu_idx = srcu_read_lock(&kvm->srcu); + struct kvm_memory_slot *memslot = + __gfn_to_memslot(kvm_memslots(kvm), gfn); + + if (memslot) { + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + + lock_rmap(rmapp); + kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + unlock_rmap(rmapp); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); + } + + /* Reload PTE after unmap */ + vpte = be64_to_cpu(hptep[0]); + + BUG_ON(vpte & HPTE_V_VALID); + BUG_ON(!(vpte & HPTE_V_ABSENT)); + + ret = 0; + if (!(vpte & HPTE_V_BOLTED)) + goto out; + + rpte = be64_to_cpu(hptep[1]); + psize = hpte_base_page_size(vpte, rpte); + avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); + pteg = idx / HPTES_PER_GROUP; + if (vpte & HPTE_V_SECONDARY) + pteg = ~pteg; + + if (!(vpte & HPTE_V_1TB_SEG)) { + unsigned long offset, vsid; + + /* We only have 28 - 23 bits of offset in avpn */ + offset = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (psize < (1ULL << 23)) + offset |= ((vsid ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (offset / psize); + } else { + unsigned long offset, vsid; + + /* We only have 40 - 23 bits of seg_off in avpn */ + offset = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (psize < (1ULL << 23)) + offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (vsid << 25) ^ (offset / psize); + } + + new_pteg = hash & new_hash_mask; + if (vpte & HPTE_V_SECONDARY) { + BUG_ON(~pteg != (hash & old_hash_mask)); + new_pteg = ~new_pteg; + } else { + BUG_ON(pteg != (hash & old_hash_mask)); + } + + new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); + new_hptep = (__be64 *)(new->virt + (new_idx << 4)); + + replace_vpte = be64_to_cpu(new_hptep[0]); + + if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + BUG_ON(new->order >= old->order); + + if (replace_vpte & HPTE_V_BOLTED) { + if (vpte & HPTE_V_BOLTED) + /* Bolted collision, nothing we can do */ + ret = -ENOSPC; + /* Discard the new HPTE */ + goto out; + } + + /* Discard the previous HPTE */ + } + + new_hptep[1] = cpu_to_be64(rpte); + new->rev[new_idx].guest_rpte = guest_rpte; + /* No need for a barrier, since new HPT isn't active */ + new_hptep[0] = cpu_to_be64(vpte); + unlock_hpte(new_hptep, vpte); + +out: + unlock_hpte(hptep, vpte); + return ret; +} + static int resize_hpt_rehash(struct kvm_resize_hpt *resize) { - return -EIO; + struct kvm *kvm = resize->kvm; + unsigned long i; + int rc; + + for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { + rc = resize_hpt_rehash_hpte(resize, i); + if (rc != 0) + return rc; + } + + return 0; } static void resize_hpt_pivot(struct kvm_resize_hpt *resize) { + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info hpt_tmp; + + /* Exchange the pending tables in the resize structure with + * the active tables */ + + resize_hpt_debug(resize, "resize_hpt_pivot()\n"); + + spin_lock(&kvm->mmu_lock); + asm volatile("ptesync" : : : "memory"); + + hpt_tmp = kvm->arch.hpt; + kvmppc_set_hpt(kvm, &resize->hpt); + resize->hpt = hpt_tmp; + + spin_unlock(&kvm->mmu_lock); + + synchronize_srcu_expedited(&kvm->srcu); + + resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); } static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { BUG_ON(kvm->arch.resize_hpt != resize); + + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kvm->arch.resize_hpt = NULL; kfree(resize); } -- cgit v1.2.3-70-g09d2 From 050f23390f6bdbfa7dd2800884d32490489851b7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:07 +1100 Subject: KVM: PPC: Book3S HV: Advertise availablity of HPT resizing on KVM HV This updates the KVM_CAP_SPAPR_RESIZE_HPT capability to advertise the presence of in-kernel HPT resizing on KVM HV. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 40a5b2d75ed1..2b3e4e620078 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -612,6 +612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPAPR_MULTITCE: r = 1; break; + case KVM_CAP_SPAPR_RESIZE_HPT: + r = !!hv_enabled; + break; #endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) && -- cgit v1.2.3-70-g09d2 From 11710dec8a45084a3e0ad2f6da944c990fad23a4 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 1 Feb 2017 11:03:45 +0100 Subject: KVM: arm/arm64: Remove kvm_vgic_inject_mapped_irq The only benefit of having kvm_vgic_inject_mapped_irq separate from kvm_vgic_inject_irq is that we pass a boolean that we use for error checking on the injection path. While this could potentially help in some aspect of robustness, it's also a little bit of a defensive move, and arguably callers into the vgic should have make sure they have marked their virtual IRQs as mapped if required. Acked-by: Marc Zyngier Reviewed-by: Andre Przywara Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 3 ++- virt/kvm/arm/vgic/vgic.c | 50 +++++++++++++++-------------------------------- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 6a084cd57b88..91ecf48f7fe2 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -175,7 +175,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) timer->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq, timer->irq.level); - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, + + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer->irq.irq, timer->irq.level); WARN_ON(ret); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index dea12df2b879..654dfd40e449 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -335,9 +335,22 @@ retry: return true; } -static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, - unsigned int intid, bool level, - bool mapped_irq) +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm: The VM structure pointer + * @cpuid: The CPU for PPIs + * @intid: The INTID to inject a new state to. + * @level: Edge-triggered: true: to trigger the interrupt + * false: to ignore the call + * Level-sensitive true: raise the input signal + * false: lower the input signal + * + * The VGIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts. You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level) { struct kvm_vcpu *vcpu; struct vgic_irq *irq; @@ -357,11 +370,6 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, if (!irq) return -EINVAL; - if (irq->hw != mapped_irq) { - vgic_put_irq(kvm, irq); - return -EINVAL; - } - spin_lock(&irq->irq_lock); if (!vgic_validate_injection(irq, level)) { @@ -382,32 +390,6 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, return 0; } -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @intid: The INTID to inject a new state to. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The VGIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level) -{ - return vgic_update_irq_pending(kvm, cpuid, intid, level, false); -} - -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level) -{ - return vgic_update_irq_pending(kvm, cpuid, intid, level, true); -} - int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); -- cgit v1.2.3-70-g09d2 From 5a6da5f78431f6b172078eb5bd524187833f360b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 19 Jan 2017 11:10:26 +0100 Subject: MIPS: KVM: Return directly after a failed copy_from_user() in kvm_arch_vcpu_ioctl() * Return directly after a call of the function "copy_from_user" failed in a case block. * Delete the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Reviewed-by: Paolo Bonzini Signed-off-by: James Hogan --- arch/mips/kvm/mips.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29ec9ab3fd55..7999ef4d1147 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1152,10 +1152,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_mips_interrupt irq; - r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; - + return -EFAULT; kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, irq.irq); @@ -1165,17 +1163,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; - r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; + return -EFAULT; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } default: r = -ENOIOCTLCMD; } - -out: return r; } -- cgit v1.2.3-70-g09d2 From 00104b4171491794b53f8d6cc255f539e8bf18b4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 4 Jan 2017 22:05:22 +0000 Subject: KVM: MIPS: Drop partial KVM_NMI implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS incompletely implements the KVM_NMI ioctl to supposedly perform a CPU reset, but all it actually does is invalidate the ASIDs. It doesn't expose the KVM_CAP_USER_NMI capability which is supposed to indicate the presence of the KVM_NMI ioctl, and no user software actually uses it on MIPS. Since this is dead code that would technically need updating for GVA page table handling in upcoming patches, remove it now. If we wanted to implement NMI injection later it can always be done properly along with the KVM_CAP_USER_NMI capability, and if we wanted to implement a proper CPU reset it would be better done with a separate ioctl. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7999ef4d1147..f9e305f7ad71 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -63,18 +63,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { {NULL} }; -static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu) -{ - int i; - - for_each_possible_cpu(i) { - vcpu->arch.guest_kernel_asid[i] = 0; - vcpu->arch.guest_user_asid[i] = 0; - } - - return 0; -} - /* * XXXKYMA: We are simulatoring a processor that has the WII bit set in * Config7, so we are "runnable" if interrupts are pending @@ -1144,10 +1132,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return -E2BIG; return kvm_mips_copy_reg_indices(vcpu, user_list->reg); } - case KVM_NMI: - /* Treat the NMI as a CPU reset */ - r = kvm_mips_reset_vcpu(vcpu); - break; case KVM_INTERRUPT: { struct kvm_mips_interrupt irq; -- cgit v1.2.3-70-g09d2 From 1534b3964901ff55465a9f582838204193354966 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 13 Mar 2015 15:54:08 +0000 Subject: KVM: MIPS/MMU: Simplify ASID restoration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM T&E uses an ASID for guest kernel mode and an ASID for guest user mode. The current ASID is saved when the guest is scheduled out, and restored when scheduling back in, with checks for whether the ASID needs to be regenerated. This isn't really necessary as the ASID can be easily determined by the current guest mode, so lets simplify it to just read the required ASID from guest_kernel_asid or guest_user_asid even if the ASID hasn't been regenerated. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mmu.c | 46 +++++++++++----------------------------- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bebec370324f..7cc53e44b42e 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -318,9 +318,6 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - /* Save/Restore the entryhi register when are are preempted/scheduled back in */ - unsigned long preempt_entryhi; - /* S/W Based TLB for guest */ struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 3b677c851be0..e1698a66253b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -237,7 +237,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); unsigned long flags; - int newasid = 0; kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); @@ -250,7 +249,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); vcpu->arch.guest_kernel_asid[cpu] = vcpu->arch.guest_kernel_mm.context.asid[cpu]; - newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -263,7 +261,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; - newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -282,35 +279,18 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_mips_migrate_count(vcpu); } - if (!newasid) { - /* - * If we preempted while the guest was executing, then reload - * the pre-empted ASID - */ - if (current->flags & PF_VCPU) { - write_c0_entryhi(vcpu->arch. - preempt_entryhi & asid_mask); - ehb(); - } - } else { - /* New ASIDs were allocated for the VM */ - - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch. - guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch. - guest_user_asid[cpu] & - asid_mask); - ehb(); - } + /* + * If we preempted while the guest was executing, then reload the ASID + * based on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & + asid_mask); + ehb(); } /* restore guest state to registers */ @@ -329,8 +309,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_save(flags); cpu = smp_processor_id(); - - vcpu->arch.preempt_entryhi = read_c0_entryhi(); vcpu->arch.last_sched_cpu = cpu; /* save guest state in registers */ -- cgit v1.2.3-70-g09d2 From a60b8438bdba4a4b77c90b6c1b22804150b4f244 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 12 Nov 2016 00:00:13 +0000 Subject: KVM: MIPS: Convert get/set_regs -> vcpu_load/put MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the get_regs() and set_regs() callbacks to vcpu_load() and vcpu_put(), which provide a cpu argument and more closely match the kvm_arch_vcpu_load() / kvm_arch_vcpu_put() that they are called by. This is in preparation for moving ASID management into the implementations. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++-- arch/mips/kvm/mmu.c | 4 ++-- arch/mips/kvm/trap_emul.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 7cc53e44b42e..1c70b5224151 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -539,8 +539,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 v); - int (*vcpu_get_regs)(struct kvm_vcpu *vcpu); - int (*vcpu_set_regs)(struct kvm_vcpu *vcpu); + int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index e1698a66253b..ed46528611f4 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -294,7 +294,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } /* restore guest state to registers */ - kvm_mips_callbacks->vcpu_set_regs(vcpu); + kvm_mips_callbacks->vcpu_load(vcpu, cpu); local_irq_restore(flags); @@ -312,7 +312,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.last_sched_cpu = cpu; /* save guest state in registers */ - kvm_mips_callbacks->vcpu_get_regs(vcpu); + kvm_mips_callbacks->vcpu_put(vcpu, cpu); if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & asid_version_mask(cpu))) { diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3b20441f2beb..c0ee51465913 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -633,15 +633,15 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, return ret; } -static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - kvm_lose_fpu(vcpu); - return 0; } -static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { + kvm_lose_fpu(vcpu); + return 0; } @@ -675,8 +675,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .copy_reg_indices = kvm_trap_emul_copy_reg_indices, .get_one_reg = kvm_trap_emul_get_one_reg, .set_one_reg = kvm_trap_emul_set_one_reg, - .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, - .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs, + .vcpu_load = kvm_trap_emul_vcpu_load, + .vcpu_put = kvm_trap_emul_vcpu_put, }; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) -- cgit v1.2.3-70-g09d2 From 1581ff3dbf698abba00f39039cc5bd854400b664 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 16 Nov 2016 23:48:56 +0000 Subject: KVM: MIPS/MMU: Move preempt/ASID handling to implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS KVM host and guest GVA ASIDs may need regenerating when scheduling a process in guest context, which is done from the kvm_arch_vcpu_load() / kvm_arch_vcpu_put() functions in mmu.c. However this is a fairly implementation specific detail. VZ for example may use GuestIDs instead of normal ASIDs to distinguish mappings belonging to different guests, and even on VZ without GuestID the root TLB will be used differently to trap & emulate. Trap & emulate GVA ASIDs only relate to the user part of the full address space, so can be left active during guest exit handling (guest context) to allow guest instructions to be easily read and translated. VZ root ASIDs however are for GPA mappings so can't be left active during normal kernel code. They also aren't useful for accessing guest virtual memory, and we should have CP0_BadInstr[P] registers available to provide encodings of trapping guest instructions anyway. Therefore move the ASID preemption handling into the implementation callback. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 51 ------------------------------------------ arch/mips/kvm/trap_emul.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 53 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index ed46528611f4..df013538113f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -235,39 +235,12 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) /* Restore ASID once we are scheduled back after preemption */ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); unsigned long flags; kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); - /* Allocate new kernel and user ASIDs if needed */ - local_irq_save(flags); - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); - } - - if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); - } - if (vcpu->arch.last_sched_cpu != cpu) { kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); @@ -279,25 +252,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_mips_migrate_count(vcpu); } - /* - * If we preempted while the guest was executing, then reload the ASID - * based on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & - asid_mask); - ehb(); - } - /* restore guest state to registers */ kvm_mips_callbacks->vcpu_load(vcpu, cpu); local_irq_restore(flags); - } /* ASID can change if another task is scheduled during preemption */ @@ -314,15 +272,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) /* save guest state in registers */ kvm_mips_callbacks->vcpu_put(vcpu, cpu); - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); - } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); - local_irq_restore(flags); } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index c0ee51465913..494a90221b5e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -11,9 +11,9 @@ #include #include -#include - #include +#include +#include #include "interrupt.h" @@ -635,6 +635,49 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + + /* Allocate new kernel and user ASIDs if needed */ + + if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + } + + if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, + vcpu->arch.guest_user_asid[cpu]); + } + + /* + * Were we in guest context? If so then the pre-empted ASID is + * no longer valid, we need to set it to what it should be based + * on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & + asid_mask); + ehb(); + } + return 0; } @@ -642,6 +685,15 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { kvm_lose_fpu(vcpu); + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + drop_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); + return 0; } -- cgit v1.2.3-70-g09d2 From c550d53934d821dbdd867ca314d417f2e918c72c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 11 Oct 2016 23:14:39 +0100 Subject: KVM: MIPS: Remove duplicated ASIDs from vcpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kvm_vcpu_arch structure contains both mm_structs for allocating MMU contexts (primarily the ASID) but it also copies the resulting ASIDs into guest_{user,kernel}_asid[] arrays which are referenced from uasm generated code. This duplication doesn't seem to serve any purpose, and it gets in the way of generalising the ASID handling across guest kernel/user modes, so lets just extract the ASID straight out of the mm_struct on demand, and in fact there are convenient cpu_context() and cpu_asid() macros for doing so. To reduce the verbosity of this code we do also add kern_mm and user_mm local variables where the kernel and user mm_structs are used. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 +--- arch/mips/kvm/emulate.c | 13 +++++++------ arch/mips/kvm/entry.c | 22 +++++++++++++--------- arch/mips/kvm/mips.c | 8 +++----- arch/mips/kvm/mmu.c | 8 ++++---- arch/mips/kvm/tlb.c | 8 ++++---- arch/mips/kvm/trap_emul.c | 29 ++++++++++++----------------- 7 files changed, 44 insertions(+), 48 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1c70b5224151..923f81dc6115 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -321,9 +321,7 @@ struct kvm_vcpu_arch { /* S/W Based TLB for guest */ struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; - /* Cached guest kernel/user ASIDs */ - u32 guest_user_asid[NR_CPUS]; - u32 guest_kernel_asid[NR_CPUS]; + /* Guest kernel/user [partial] mm */ struct mm_struct guest_kernel_mm, guest_user_mm; /* Guest ASID of last user mode execution */ diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index aa0937423e28..060acc5b3378 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -856,6 +856,8 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu, i; bool user; @@ -879,8 +881,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, if (i == cpu) continue; if (user) - vcpu->arch.guest_user_asid[i] = 0; - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, user_mm) = 0; + cpu_context(i, kern_mm) = 0; } preempt_enable(); @@ -1056,6 +1058,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; @@ -1178,13 +1181,11 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, */ preempt_disable(); cpu = smp_processor_id(); - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, + kvm_get_new_mmu_context(kern_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; for_each_possible_cpu(i) if (i != cpu) - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, kern_mm) = 0; preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index e92fb190e2d6..f81888704caa 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -286,23 +287,26 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); uasm_i_xori(&p, T0, T0, KSU_USER); uasm_il_bnez(&p, &r, T0, label_kernel_asid); - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_kernel_mm.context.asid)); /* else user */ - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_user_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_user_mm.context.asid)); uasm_l_kernel_asid(&l, p); /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); - /* x4 */ - uasm_i_sll(&p, T2, T2, 2); + /* index the ASID array */ + uasm_i_sll(&p, T2, T2, ilog2(sizeof(long))); UASM_i_ADDU(&p, T3, T1, T2); - uasm_i_lw(&p, K0, 0, T3); + UASM_i_LW(&p, K0, 0, T3); #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - /* x sizeof(struct cpuinfo_mips)/4 */ - uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + /* + * reuse ASID array offset + * cpuinfo_mips is a multiple of sizeof(long) + */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long)); uasm_i_mul(&p, T2, T2, T3); UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index f9e305f7ad71..85bc54f35695 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -413,6 +413,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, /* Must be called with preemption disabled, just before entering guest */ static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; struct mips_coproc *cop0 = vcpu->arch.cop0; int i, cpu = smp_processor_id(); unsigned int gasid; @@ -426,13 +427,10 @@ static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) if (!KVM_GUEST_KERNEL_MODE(vcpu)) { gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, - vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; + kvm_get_new_mmu_context(user_mm, cpu, vcpu); for_each_possible_cpu(i) if (i != cpu) - vcpu->arch.guest_user_asid[cpu] = 0; + cpu_context(i, user_mm) = 0; vcpu->arch.last_user_gasid = gasid; } } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index df013538113f..27d6d0dbfeb4 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -15,18 +15,18 @@ static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, kern_mm); } static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, user_mm); } static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 254377d8e0b9..ba490130b5e7 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -38,18 +38,18 @@ EXPORT_SYMBOL_GPL(kvm_mips_instance); static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, kern_mm); } static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, user_mm); } inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 494a90221b5e..c7854d32fd64 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -635,32 +635,29 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; /* Allocate new kernel and user ASIDs if needed */ - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; + kvm_get_new_mmu_context(kern_mm, cpu, vcpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#lx\n", + cpu, cpu_context(cpu, kern_mm)); } - if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; + kvm_get_new_mmu_context(user_mm, cpu, vcpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#lx\n", + cpu, cpu_context(cpu, user_mm)); } /* @@ -670,11 +667,9 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ if (current->flags & PF_VCPU) { if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & - asid_mask); + write_c0_entryhi(cpu_asid(cpu, kern_mm)); else - write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & - asid_mask); + write_c0_entryhi(cpu_asid(cpu, user_mm)); ehb(); } -- cgit v1.2.3-70-g09d2 From a2c046e40ff16ef6c20d534b0d77d526bc02a684 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 18 Nov 2016 13:14:37 +0000 Subject: KVM: MIPS: Add vcpu_run() & vcpu_reenter() callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add implementation callbacks for entering the guest (vcpu_run()) and reentering the guest (vcpu_reenter()), allowing implementation specific operations to be performed before entering the guest or after returning to the host without cluttering kvm_arch_vcpu_ioctl_run(). This allows the T&E specific lazy user GVA flush to be moved into trap_emul.c, along with disabling of the HTW. We also move kvm_mips_deliver_interrupts() as VZ will need to restore the guest timer state prior to delivering interrupts. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 43 ++--------------------------------- arch/mips/kvm/trap_emul.c | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 41 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 923f81dc6115..9f319375835a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -539,6 +539,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 v); int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 85bc54f35695..1733877d8a53 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -410,32 +410,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -/* Must be called with preemption disabled, just before entering guest */ -static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) -{ - struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - struct mips_coproc *cop0 = vcpu->arch.cop0; - int i, cpu = smp_processor_id(); - unsigned int gasid; - - /* - * Lazy host ASID regeneration for guest user mode. - * If the guest ASID has changed since the last guest usermode - * execution, regenerate the host ASID so as to invalidate stale TLB - * entries. - */ - if (!KVM_GUEST_KERNEL_MODE(vcpu)) { - gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; - if (gasid != vcpu->arch.last_user_gasid) { - kvm_get_new_mmu_context(user_mm, cpu, vcpu); - for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, user_mm) = 0; - vcpu->arch.last_user_gasid = gasid; - } - } -} - int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = 0; @@ -453,25 +427,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) lose_fpu(1); local_irq_disable(); - /* Check if we have any exceptions/interrupts pending */ - kvm_mips_deliver_interrupts(vcpu, - kvm_read_c0_guest_cause(vcpu->arch.cop0)); - guest_enter_irqoff(); - - /* Disable hardware page table walking while in guest */ - htw_stop(); - trace_kvm_enter(vcpu); - kvm_mips_check_asids(vcpu); + r = kvm_mips_callbacks->vcpu_run(run, vcpu); - r = vcpu->arch.vcpu_run(run, vcpu); trace_kvm_out(vcpu); - - /* Re-enable HTW before enabling interrupts */ - htw_start(); - guest_exit_irqoff(); local_irq_enable(); @@ -1570,7 +1531,7 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); - kvm_mips_check_asids(vcpu); + kvm_mips_callbacks->vcpu_reenter(run, vcpu); /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index c7854d32fd64..92734d095c94 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -692,6 +692,52 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) return 0; } +static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mips_coproc *cop0 = vcpu->arch.cop0; + int i, cpu = smp_processor_id(); + unsigned int gasid; + + /* + * Lazy host ASID regeneration for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, regenerate the host ASID so as to invalidate stale TLB + * entries. + */ + if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; + if (gasid != vcpu->arch.last_user_gasid) { + kvm_get_new_mmu_context(user_mm, cpu, vcpu); + for_each_possible_cpu(i) + if (i != cpu) + cpu_context(i, user_mm) = 0; + vcpu->arch.last_user_gasid = gasid; + } + } +} + +static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int r; + + /* Check if we have any exceptions/interrupts pending */ + kvm_mips_deliver_interrupts(vcpu, + kvm_read_c0_guest_cause(vcpu->arch.cop0)); + + kvm_trap_emul_vcpu_reenter(run, vcpu); + + /* Disable hardware page table walking while in guest */ + htw_stop(); + + r = vcpu->arch.vcpu_run(run, vcpu); + + htw_start(); + + return r; +} + static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { /* exit handlers */ .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable, @@ -724,6 +770,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .set_one_reg = kvm_trap_emul_set_one_reg, .vcpu_load = kvm_trap_emul_vcpu_load, .vcpu_put = kvm_trap_emul_vcpu_put, + .vcpu_run = kvm_trap_emul_vcpu_run, + .vcpu_reenter = kvm_trap_emul_vcpu_reenter, }; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) -- cgit v1.2.3-70-g09d2 From 91cdee5710d5fe8f81915307b5ff38d364fbde33 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 18 Nov 2016 13:25:24 +0000 Subject: KVM: MIPS/T&E: Restore host asid on return to host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only need the guest ASID loaded while in guest context, i.e. while running guest code and while handling guest exits. We load the guest ASID when entering the guest, however we restore the host ASID later than necessary, when the VCPU state is saved i.e. vcpu_put() or slightly earlier if preempted after returning to the host. This mismatch is both unpleasant and causes redundant host ASID restores in kvm_trap_emul_vcpu_put(). Lets explicitly restore the host ASID when returning to the host, and don't bother restoring the host ASID on context switch in unless we're already in guest context. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 92734d095c94..3e1dbcbcea85 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -680,14 +680,17 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { kvm_lose_fpu(vcpu); - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); + if (current->flags & PF_VCPU) { + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + get_new_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); return 0; } @@ -720,6 +723,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { + int cpu; int r; /* Check if we have any exceptions/interrupts pending */ @@ -733,6 +737,15 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) r = vcpu->arch.vcpu_run(run, vcpu); + /* We may have migrated while handling guest exits */ + cpu = smp_processor_id(); + + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) + get_new_mmu_context(current->mm, cpu); + write_c0_entryhi(cpu_asid(cpu, current->mm)); + htw_start(); return r; -- cgit v1.2.3-70-g09d2 From a7ebb2e410f8be2b061557fd85241d75a1094221 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 15 Nov 2016 00:06:05 +0000 Subject: KVM: MIPS/T&E: active_mm = init_mm in guest context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set init_mm as the active_mm and update mm_cpumask(current->mm) to reflect that it isn't active when in guest context. This prevents cache management code from attempting cache flushes on host virtual addresses while in guest context, for example due to a cache management IPIs or later when writing of dynamically translated code hits copy on write. We do this using helpers in static kernel code to avoid having to export init_mm to modules. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++++ arch/mips/kvm/tlb.c | 35 +++++++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 12 +++++++++++- 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 9f319375835a..95320b7964a6 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -607,6 +607,10 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); + +void kvm_mips_suspend_mm(int cpu); +void kvm_mips_resume_mm(int cpu); + extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index ba490130b5e7..6c1f894b8754 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -382,3 +382,38 @@ void kvm_local_flush_tlb_all(void) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); + +/** + * kvm_mips_suspend_mm() - Suspend the active mm. + * @cpu The CPU we're running on. + * + * Suspend the active_mm, ready for a switch to a KVM guest virtual address + * space. This is left active for the duration of guest context, including time + * with interrupts enabled, so we need to be careful not to confuse e.g. cache + * management IPIs. + * + * kvm_mips_resume_mm() should be called before context switching to a different + * process so we don't need to worry about reference counting. + * + * This needs to be in static kernel code to avoid exporting init_mm. + */ +void kvm_mips_suspend_mm(int cpu) +{ + cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm)); + current->active_mm = &init_mm; +} +EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm); + +/** + * kvm_mips_resume_mm() - Resume the current process mm. + * @cpu The CPU we're running on. + * + * Resume the mm of the current process, after a switch back from a KVM guest + * virtual address space (see kvm_mips_suspend_mm()). + */ +void kvm_mips_resume_mm(int cpu) +{ + cpumask_set_cpu(cpu, mm_cpumask(current->mm)); + current->active_mm = current->mm; +} +EXPORT_SYMBOL_GPL(kvm_mips_resume_mm); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3e1dbcbcea85..9cfe4d2a283c 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -670,6 +670,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) write_c0_entryhi(cpu_asid(cpu, kern_mm)); else write_c0_entryhi(cpu_asid(cpu, user_mm)); + kvm_mips_suspend_mm(cpu); ehb(); } @@ -689,6 +690,7 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) get_new_mmu_context(current->mm, cpu); } write_c0_entryhi(cpu_asid(cpu, current->mm)); + kvm_mips_resume_mm(cpu); ehb(); } @@ -723,7 +725,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { - int cpu; + int cpu = smp_processor_id(); int r; /* Check if we have any exceptions/interrupts pending */ @@ -735,6 +737,13 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Disable hardware page table walking while in guest */ htw_stop(); + /* + * While in guest context we're in the guest's address space, not the + * host process address space, so we need to be careful not to confuse + * e.g. cache management IPIs. + */ + kvm_mips_suspend_mm(cpu); + r = vcpu->arch.vcpu_run(run, vcpu); /* We may have migrated while handling guest exits */ @@ -745,6 +754,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); write_c0_entryhi(cpu_asid(cpu, current->mm)); + kvm_mips_resume_mm(cpu); htw_start(); -- cgit v1.2.3-70-g09d2 From 630766b3694f0574f903863ef5c3cc4f2bbe736f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Sep 2016 23:00:24 +0100 Subject: KVM: MIPS: Wire up vcpu uninit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up a vcpu uninit implementation callback. This will be used for the clean up of GVA->HPA page tables. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mips.c | 5 +++++ arch/mips/kvm/trap_emul.c | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 95320b7964a6..fea538fc5331 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -519,6 +519,7 @@ struct kvm_mips_callbacks { int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); int (*vm_init)(struct kvm *kvm); int (*vcpu_init)(struct kvm_vcpu *vcpu); + void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); @@ -765,7 +766,6 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1733877d8a53..011710a68c6b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1345,6 +1345,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mips_callbacks->vcpu_uninit(vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 9cfe4d2a283c..07540cf2b557 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -440,6 +440,10 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -779,6 +783,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vm_init = kvm_trap_emul_vm_init, .vcpu_init = kvm_trap_emul_vcpu_init, + .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, -- cgit v1.2.3-70-g09d2 From f7f1427dc0c67e21ba9ec2200b7c8853535b3842 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Sep 2016 22:57:03 +0100 Subject: KVM: MIPS/T&E: Allocate GVA -> HPA page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocate GVA -> HPA page tables for guest kernel and guest user mode on each VCPU, to allow for fast path TLB refill handling to be added later. In the process kvm_arch_vcpu_init() needs updating to pass on any error from the vcpu_init() callback. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 7 +++++- arch/mips/kvm/trap_emul.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 011710a68c6b..7cf85fa1f658 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1338,7 +1338,12 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - kvm_mips_callbacks->vcpu_init(vcpu); + int err; + + err = kvm_mips_callbacks->vcpu_init(vcpu); + if (err) + return err; + hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 07540cf2b557..183150a963ec 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "interrupt.h" @@ -435,13 +436,75 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm) static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + vcpu->arch.kscratch_enabled = 0xfc; + /* + * Allocate GVA -> HPA page tables. + * MIPS doesn't use the mm_struct pointer argument. + */ + kern_mm->pgd = pgd_alloc(kern_mm); + if (!kern_mm->pgd) + return -ENOMEM; + + user_mm->pgd = pgd_alloc(user_mm); + if (!user_mm->pgd) { + pgd_free(kern_mm, kern_mm->pgd); + return -ENOMEM; + } + return 0; } +static void kvm_mips_emul_free_gva_pt(pgd_t *pgd) +{ + /* Don't free host kernel page tables copied from init_mm.pgd */ + const unsigned long end = 0x80000000; + unsigned long pgd_va, pud_va, pmd_va; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, j, k; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + if (pgd_none(pgd[i])) + continue; + + pgd_va = (unsigned long)i << PGDIR_SHIFT; + if (pgd_va >= end) + break; + pud = pud_offset(pgd + i, 0); + for (j = 0; j < PTRS_PER_PUD; j++) { + if (pud_none(pud[j])) + continue; + + pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT); + if (pud_va >= end) + break; + pmd = pmd_offset(pud + j, 0); + for (k = 0; k < PTRS_PER_PMD; k++) { + if (pmd_none(pmd[k])) + continue; + + pmd_va = pud_va | (k << PMD_SHIFT); + if (pmd_va >= end) + break; + pte = pte_offset(pmd + k, 0); + pte_free_kernel(NULL, pte); + } + pmd_free(NULL, pmd); + } + pud_free(NULL, pud); + } + pgd_free(NULL, pgd); +} + static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu) { + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd); + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd); } static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 7faa6eec6991715d6c1d85c192738dcac405ab89 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 23:58:53 +0100 Subject: KVM: MIPS/T&E: Activate GVA page tables in guest context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Activate the GVA page tables when in guest context. This will allow the normal Linux TLB refill handler to fill from it when guest memory is read, as well as preventing accidental reading from user memory. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/mmu_context.h | 4 +++- arch/mips/kvm/entry.c | 16 +++++++++++++++- arch/mips/kvm/trap_emul.c | 10 ++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index ddd57ade1aa8..16eb8521398e 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -29,9 +29,11 @@ do { \ } \ } while (0) +extern void tlbmiss_handler_setup_pgd(unsigned long); + +/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */ #define TLBMISS_HANDLER_SETUP_PGD(pgd) \ do { \ - extern void tlbmiss_handler_setup_pgd(unsigned long); \ tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \ htw_set_pwbase((unsigned long)pgd); \ } while (0) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index f81888704caa..f683d123172c 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -316,7 +317,20 @@ static void *kvm_mips_build_enter_guest(void *addr) #else uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); #endif - uasm_i_mtc0(&p, K0, C0_ENTRYHI); + + /* + * Set up KVM T&E GVA pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - but skips write into CP0_PWBase for now + */ + UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) - + (int)offsetof(struct mm_struct, context.asid), T1); + + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); /* Disable RDHWR access */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 183150a963ec..f39d427649dc 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -704,6 +704,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; /* Allocate new kernel and user ASIDs if needed */ @@ -733,10 +734,9 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * on the mode of the Guest (Kernel/User) */ if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(cpu_asid(cpu, kern_mm)); - else - write_c0_entryhi(cpu_asid(cpu, user_mm)); + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm; + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); kvm_mips_suspend_mm(cpu); ehb(); } @@ -757,6 +757,7 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) get_new_mmu_context(current->mm, cpu); } write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); ehb(); } @@ -821,6 +822,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); htw_start(); -- cgit v1.2.3-70-g09d2 From 29b500b54ef379f1f3227b633dd477a4dd3cd62b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 11 Nov 2016 14:08:32 +0000 Subject: KVM: MIPS: Support NetLogic KScratch registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tlbex.c uses the implementation dependent $22 CP0 register group on NetLogic cores, with the help of the c0_kscratch() helper. Allow these registers to be allocated by the KVM entry code too instead of assuming KScratch registers are all $31, which will also allow pgd_reg to be handled since it is allocated that way. We also drop the masking of kscratch_mask with 0xfc, as it is redundant for the standard KScratch registers (Config4.KScrExist won't have the low 2 bits set anyway), and apparently not necessary for NetLogic. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/entry.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index f683d123172c..7424d3d566ff 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -91,6 +91,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr); static void *kvm_mips_build_ret_to_guest(void *addr); static void *kvm_mips_build_ret_to_host(void *addr); +/* + * The version of this function in tlbex.c uses current_cpu_type(), but for KVM + * we assume symmetry. + */ +static int c0_kscratch(void) +{ + switch (boot_cpu_type()) { + case CPU_XLP: + case CPU_XLR: + return 22; + default: + return 31; + } +} + /** * kvm_mips_entry_setup() - Perform global setup for entry code. * @@ -105,18 +120,18 @@ int kvm_mips_entry_setup(void) * We prefer to use KScratchN registers if they are available over the * defaults above, which may not work on all cores. */ - unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + unsigned int kscratch_mask = cpu_data[0].kscratch_mask; /* Pick a scratch register for storing VCPU */ if (kscratch_mask) { - scratch_vcpu[0] = 31; + scratch_vcpu[0] = c0_kscratch(); scratch_vcpu[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_vcpu[1]); } /* Pick a scratch register to use as a temp for saving state */ if (kscratch_mask) { - scratch_tmp[0] = 31; + scratch_tmp[0] = c0_kscratch(); scratch_tmp[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_tmp[1]); } @@ -132,7 +147,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); /* Save the temp scratch register value in cp0_cause of stack frame */ - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); } @@ -148,7 +163,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); } -- cgit v1.2.3-70-g09d2 From a7cfa7ac1236937dac431845596a39ba27364a00 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 10 Sep 2016 23:56:46 +0100 Subject: KVM: MIPS: Add fast path TLB refill handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use functions from the general MIPS TLB exception vector generation code (tlbex.c) to construct a fast path TLB refill handler similar to the general one, but cut down and capable of preserving K0 and K1. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/entry.c | 78 ++++++++++++++++++++++++++++++++++++++++ arch/mips/kvm/mips.c | 8 +++-- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index fea538fc5331..80928ffa0150 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -554,6 +554,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler); void *kvm_mips_build_exception(void *addr, void *handler); void *kvm_mips_build_exit(void *addr); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 7424d3d566ff..1ae33e0e675c 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* Register names */ @@ -122,6 +123,9 @@ int kvm_mips_entry_setup(void) */ unsigned int kscratch_mask = cpu_data[0].kscratch_mask; + if (pgd_reg != -1) + kscratch_mask &= ~BIT(pgd_reg); + /* Pick a scratch register for storing VCPU */ if (kscratch_mask) { scratch_vcpu[0] = c0_kscratch(); @@ -380,6 +384,80 @@ static void *kvm_mips_build_enter_guest(void *addr) return p; } +/** + * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler. + * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). + * + * Assemble TLB refill exception fast path handler for guest execution. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + + /* + * Some of the common tlbex code uses current_cpu_type(). For KVM we + * assume symmetry and just disable preemption to silence the warning. + */ + preempt_disable(); + + /* + * Now for the actual refill bit. A lot of this can be common with the + * Linux TLB refill handler, however we don't need to handle so many + * cases. We only need to handle user mode refills, and user mode runs + * with 32-bit addressing. + * + * Therefore the branch to label_vmalloc generated by build_get_pmde64() + * that isn't resolved should never actually get taken and is harmless + * to leave in place for now. + */ + +#ifdef CONFIG_64BIT + build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */ +#else + build_get_pgde32(&p, K0, K1); /* get pgd in K1 */ +#endif + + /* we don't support huge pages yet */ + + build_get_ptep(&p, K0, K1); + build_update_entries(&p, K0, K1); + build_tlb_write_entry(&p, &l, &r, tlb_random); + + preempt_enable(); + + /* Get the VCPU pointer from the VCPU scratch register again */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + uasm_i_ehb(&p); + UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Jump to guest */ + uasm_i_eret(&p); + + return p; +} + /** * kvm_mips_build_exception() - Assemble first level guest exception handler. * @addr: Address to start writing code. diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7cf85fa1f658..a687864de428 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -264,7 +264,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; - void *gebase, *p, *handler; + void *gebase, *p, *handler, *refill_start, *refill_end; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -317,8 +317,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Build guest exception vectors dynamically in unmapped memory */ handler = gebase + 0x2000; - /* TLB Refill, EXL = 0 */ - kvm_mips_build_exception(gebase, handler); + /* TLB refill */ + refill_start = gebase; + refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler); /* General Exception Entry point */ kvm_mips_build_exception(gebase + 0x180, handler); @@ -344,6 +345,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) pr_debug("#include \n"); pr_debug("\n"); dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_tlb_refill", refill_start, refill_end); dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); -- cgit v1.2.3-70-g09d2 From f3a8603f098fd2c68311d945a6531d1e3b62271c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:01:05 +0100 Subject: KVM: MIPS/TLB: Fix off-by-one in TLB invalidate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_host_tlb_inv() uses the TLBP instruction to probe the host TLB for an entry matching the given guest virtual address, and determines whether a match was found based on whether CP0_Index > 0. This is technically incorrect as an index of 0 (with the high bit clear) is a perfectly valid TLB index. This is harmless at the moment due to the use of at least 1 wired TLB entry for the KVM commpage, however we will soon be ridding ourselves of that particular wired entry so lets fix the condition in case the entry needing invalidation does land at TLB index 0. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/tlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 6c1f894b8754..4bf82613d440 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -282,7 +282,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) if (idx >= current_cpu_data.tlbsize) BUG(); - if (idx > 0) { + if (idx >= 0) { write_c0_entryhi(UNIQUE_ENTRYHI(idx)); write_c0_entrylo0(0); write_c0_entrylo1(0); @@ -297,7 +297,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) local_irq_restore(flags); - if (idx > 0) + if (idx >= 0) kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); -- cgit v1.2.3-70-g09d2 From 57e3869cfaaec712f6ea1855ab7ba868f6f306ed Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 00:15:52 +0100 Subject: KVM: MIPS/TLB: Generalise host TLB invalidate to kernel ASID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor kvm_mips_host_tlb_inv() to also be able to invalidate any matching TLB entry in the kernel ASID rather than assuming only the TLB entries in the user ASID can change. Two new bool user/kernel arguments allow the caller to indicate whether the mapping should affect each of the ASIDs for guest user/kernel mode. - kvm_mips_invalidate_guest_tlb() (used by TLBWI/TLBWR emulation) can now invalidate any corresponding TLB entry in both the kernel ASID (guest kernel may have accessed any guest mapping), and the user ASID if the entry being replaced is in guest USeg (where guest user may also have accessed it). - The tlbmod fault handler (and the KSeg0 / TLB mapped / commpage fault handlers in later patches) can now invalidate the corresponding TLB entry in whichever ASID is currently active, since only a single page table will have been updated anyway. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 ++- arch/mips/kvm/emulate.c | 6 ++++-- arch/mips/kvm/tlb.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 80928ffa0150..fb2ea578c193 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -604,7 +604,8 @@ extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, unsigned long entrylo1, int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); -extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); +extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, + bool user, bool kernel); extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 060acc5b3378..611b8996ca0c 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -873,7 +873,7 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, * Probe the shadow host TLB for the entry being overwritten, if one * matches, invalidate it */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true); /* Invalidate the whole ASID on other CPUs */ cpu = smp_processor_id(); @@ -2100,13 +2100,15 @@ enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); + bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); int index; /* If address not in the guest TLB, then we are in trouble */ index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); if (index < 0) { /* XXXKYMA Invalidate and retry */ - kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr); + kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr, + !kernel, kernel); kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n", __func__, entryhi); kvm_mips_dump_guest_tlbs(vcpu); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 4bf82613d440..06ee9a1d78a5 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -263,16 +263,11 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup); -int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) +static int _kvm_mips_host_tlb_inv(unsigned long entryhi) { int idx; - unsigned long flags, old_entryhi; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu)); + write_c0_entryhi(entryhi); mtc0_tlbw_hazard(); tlb_probe(); @@ -292,14 +287,39 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) tlbw_use_hazard(); } + return idx; +} + +int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, + bool user, bool kernel) +{ + int idx_user, idx_kernel; + unsigned long flags, old_entryhi; + + local_irq_save(flags); + + old_entryhi = read_c0_entryhi(); + + if (user) + idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu)); + if (kernel) + idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu)); + write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); local_irq_restore(flags); - if (idx >= 0) - kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, - (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); + if (user && idx_user >= 0) + kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu), idx_user); + if (kernel && idx_kernel >= 0) + kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu), idx_kernel); return 0; } -- cgit v1.2.3-70-g09d2 From a31b50d741bd85a127d5ef2c21c0788041bc41a9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Dec 2016 15:57:00 +0000 Subject: KVM: MIPS/MMU: Invalidate GVA PTs on ASID changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement invalidation of large ranges of virtual addresses from GVA page tables in response to a guest ASID change (immediately for guest kernel page table, lazily for guest user page table). We iterate through a range of page tables invalidating entries and freeing fully invalidated tables. To minimise overhead the exact ranges invalidated depends on the flags argument to kvm_mips_flush_gva_pt(), which also allows it to be used in future KVM_CAP_SYNC_MMU patches in response to GPA changes, which unlike guest TLB mapping changes affects guest KSeg0 mappings. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 17 +++++ arch/mips/kvm/emulate.c | 11 ++++ arch/mips/kvm/mmu.c | 134 +++++++++++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 5 +- 4 files changed, 165 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index fb2ea578c193..f5145dcab319 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -614,6 +614,23 @@ extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); +/* MMU handling */ + +/** + * enum kvm_mips_flush - Types of MMU flushes. + * @KMF_USER: Flush guest user virtual memory mappings. + * Guest USeg only. + * @KMF_KERN: Flush guest kernel virtual memory mappings. + * Guest USeg and KSeg2/3. + * @KMF_GPA: Flush guest physical memory mappings. + * Also includes KSeg0 if KMF_KERN is set. + */ +enum kvm_mips_flush { + KMF_USER = 0x0, + KMF_KERN = 0x1, + KMF_GPA = 0x2, +}; +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 611b8996ca0c..1d399396e486 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1171,6 +1171,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, & KVM_ENTRYHI_ASID, nasid); + /* + * Flush entries from the GVA page + * tables. + * Guest user page table will get + * flushed lazily on re-entry to guest + * user if the guest ASID actually + * changes. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, + KMF_KERN); + /* * Regenerate/invalidate kernel MMU * context. diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 27d6d0dbfeb4..09146b62552f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -12,6 +12,7 @@ #include #include #include +#include static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { @@ -80,6 +81,139 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +/* + * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}. + * Flush a range of guest physical address space from the VM's GPA page tables. + */ + +static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva, + unsigned long end_gva) +{ + int i_min = __pte_offset(start_gva); + int i_max = __pte_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1); + int i; + + /* + * There's no freeing to do, so there's no point clearing individual + * entries unless only part of the last level page table needs flushing. + */ + if (safe_to_remove) + return true; + + for (i = i_min; i <= i_max; ++i) { + if (!pte_present(pte[i])) + continue; + + set_pte(pte + i, __pte(0)); + } + return false; +} + +static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva, + unsigned long end_gva) +{ + pte_t *pte; + unsigned long end = ~0ul; + int i_min = __pmd_offset(start_gva); + int i_max = __pmd_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pmd_present(pmd[i])) + continue; + + pte = pte_offset(pmd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pte(pte, start_gva, end)) { + pmd_clear(pmd + i); + pte_free_kernel(NULL, pte); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva, + unsigned long end_gva) +{ + pmd_t *pmd; + unsigned long end = ~0ul; + int i_min = __pud_offset(start_gva); + int i_max = __pud_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pud_present(pud[i])) + continue; + + pmd = pmd_offset(pud + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) { + pud_clear(pud + i); + pmd_free(NULL, pmd); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva, + unsigned long end_gva) +{ + pud_t *pud; + unsigned long end = ~0ul; + int i_min = pgd_index(start_gva); + int i_max = pgd_index(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pgd_present(pgd[i])) + continue; + + pud = pud_offset(pgd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pud(pud, start_gva, end)) { + pgd_clear(pgd + i); + pud_free(NULL, pud); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) +{ + if (flags & KMF_GPA) { + /* all of guest virtual address space could be affected */ + if (flags & KMF_KERN) + /* useg, kseg0, seg2/3 */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff); + else + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + } else { + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + + /* kseg2/3 */ + if (flags & KMF_KERN) + kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff); + } +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index f39d427649dc..6a56e48f4bfa 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -774,14 +774,15 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, unsigned int gasid; /* - * Lazy host ASID regeneration for guest user mode. + * Lazy host ASID regeneration / PT flush for guest user mode. * If the guest ASID has changed since the last guest usermode * execution, regenerate the host ASID so as to invalidate stale TLB - * entries. + * entries and flush GVA PT entries too. */ if (!KVM_GUEST_KERNEL_MODE(vcpu)) { gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); kvm_get_new_mmu_context(user_mm, cpu, vcpu); for_each_possible_cpu(i) if (i != cpu) -- cgit v1.2.3-70-g09d2 From aba8592950f1c698bb9c1b42d4f4dab07a145674 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Dec 2016 15:57:00 +0000 Subject: KVM: MIPS/MMU: Invalidate stale GVA PTEs on TLBW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement invalidation of specific pairs of GVA page table entries in one or both of the GVA page tables. This is used when existing mappings are replaced in the guest TLB by emulated TLBWI/TLBWR instructions. Due to the sharing of page tables in the host kernel range, we should be careful not to allow host pages to be invalidated. Add a helper kvm_mips_walk_pgd() which can be used when walking of either GPA (future patches) or GVA page tables is needed, optionally with allocation of page tables along the way when they don't exist. GPA page table walking will need to be protected by the kvm->mmu_lock, so we also add a small MMU page cache in each KVM VCPU, like that found for other architectures but smaller. This allows enough pages to be pre-allocated to handle a single fault without holding the lock, allowing the helper to run with the lock held without having to handle allocation failures. Using the same mechanism for GVA allows the same code to be used, and allows it to use the same cache of allocated pages if the GPA walk didn't need to allocate any new tables. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 17 +++++++ arch/mips/kvm/emulate.c | 6 +++ arch/mips/kvm/mips.c | 1 + arch/mips/kvm/mmu.c | 95 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f5145dcab319..40aab4f5007c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -261,6 +261,17 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; +#define KVM_NR_MEM_OBJS 4 + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + #define KVM_MIPS_AUX_FPU 0x1 #define KVM_MIPS_AUX_MSA 0x2 @@ -327,6 +338,9 @@ struct kvm_vcpu_arch { /* Guest ASID of last user mode execution */ unsigned int last_user_gasid; + /* Cache some mmu pages needed inside spinlock regions */ + struct kvm_mmu_memory_cache mmu_page_cache; + int last_sched_cpu; /* WAIT executed */ @@ -631,6 +645,9 @@ enum kvm_mips_flush { KMF_GPA = 0x2, }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user); extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 1d399396e486..19eaeda6975c 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -864,11 +864,17 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, /* No need to flush for entries which are already invalid */ if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) return; + /* Don't touch host kernel page tables or TLB mappings */ + if ((unsigned long)tlb->tlb_hi > 0x7fffffff) + return; /* User address space doesn't need flushing for KSeg2/3 changes */ user = tlb->tlb_hi < KVM_GUEST_KSEG0; preempt_disable(); + /* Invalidate page table entries */ + kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user); + /* * Probe the shadow host TLB for the entry being overwritten, if one * matches, invalidate it diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a687864de428..c369fdd19fbc 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -396,6 +396,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_mips_dump_stats(vcpu); + kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); kfree(vcpu); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 09146b62552f..dbf2b55ee874 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -14,6 +14,26 @@ #include #include +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + BUG_ON(!mc || !mc->nobjs); + p = mc->objects[--mc->nobjs]; + return p; +} + +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); +} + static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -30,6 +50,56 @@ static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) return cpu_asid(cpu, user_mm); } +/** + * kvm_mips_walk_pgd() - Walk page table with optional allocation. + * @pgd: Page directory pointer. + * @addr: Address to index page table using. + * @cache: MMU page cache to allocate new page tables from, or NULL. + * + * Walk the page tables pointed to by @pgd to find the PTE corresponding to the + * address @addr. If page tables don't exist for @addr, they will be created + * from the MMU cache if @cache is not NULL. + * + * Returns: Pointer to pte_t corresponding to @addr. + * NULL if a page table doesn't exist for @addr and !@cache. + * NULL if a page table allocation failed. + */ +static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pgd += pgd_index(addr); + if (pgd_none(*pgd)) { + /* Not used on MIPS yet */ + BUG(); + return NULL; + } + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + pmd_t *new_pmd; + + if (!cache) + return NULL; + new_pmd = mmu_memory_cache_alloc(cache); + pmd_init((unsigned long)new_pmd, + (unsigned long)invalid_pte_table); + pud_populate(NULL, pud, new_pmd); + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t *new_pte; + + if (!cache) + return NULL; + new_pte = mmu_memory_cache_alloc(cache); + clear_page(new_pte); + pmd_populate_kernel(NULL, pmd, new_pte); + } + return pte_offset(pmd, addr); +} + static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) { int srcu_idx, err = 0; @@ -81,6 +151,31 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user) +{ + pgd_t *pgdp; + pte_t *ptep; + + addr &= PAGE_MASK << 1; + + pgdp = vcpu->arch.guest_kernel_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } + + if (user) { + pgdp = vcpu->arch.guest_user_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } + } +} + /* * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}. * Flush a range of guest physical address space from the VM's GPA page tables. -- cgit v1.2.3-70-g09d2 From fb99589391a9ed2e505dc7c3d02651a1a7b9f72b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 5 Jan 2017 10:44:38 +0000 Subject: KVM: MIPS/MMU: Convert KSeg0 faults to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of KSeg0 page faults from the guest to fill the GVA page tables and invalidate the TLB entry, rather than filling a TLB entry directly. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 79 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index dbf2b55ee874..afb47f21d8bc 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -14,6 +14,33 @@ #include #include +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels + * for which pages need to be cached. + */ +#if defined(__PAGETABLE_PMD_FOLDED) +#define KVM_MMU_CACHE_MIN_PAGES 1 +#else +#define KVM_MMU_CACHE_MIN_PAGES 2 +#endif + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + int min, int max) +{ + void *page; + + BUG_ON(max > KVM_NR_MEM_OBJS); + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < max) { + page = (void *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + cache->objects[cache->nobjs++] = page; + } + return 0; +} + static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) @@ -151,6 +178,27 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, + unsigned long addr) +{ + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + pgd_t *pgdp; + int ret; + + /* We need a minimum of cached pages ready for page table creation */ + ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); + if (ret) + return NULL; + + if (KVM_GUEST_KERNEL_MODE(vcpu)) + pgdp = vcpu->arch.guest_kernel_mm.pgd; + else + pgdp = vcpu->arch.guest_user_mm.pgd; + + return kvm_mips_walk_pgd(pgdp, memcache, addr); +} + void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user) { @@ -316,10 +364,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, gfn_t gfn; kvm_pfn_t pfn0, pfn1; unsigned long vaddr = 0; - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; - const int flush_dcache_mask = 0; - int ret; + pte_t *ptep_gva; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -327,6 +373,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, return -1; } + /* Find host PFNs */ + gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, @@ -345,20 +393,21 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; + /* Find GVA page table entry */ - preempt_disable(); - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); - preempt_enable(); + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, vaddr); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", vaddr); + return -1; + } - return ret; + /* Write host PFNs into GVA page table */ + ptep_gva[0] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn0, PAGE_SHARED))); + ptep_gva[1] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn1, PAGE_SHARED))); + + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, vaddr, false, true); + return 0; } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From 7e3d2a750bcb0a7fed84e14e562d752c39fdf542 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 01:15:19 +0100 Subject: KVM: MIPS/MMU: Convert TLB mapped faults to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of page faults in TLB mapped segment from the guest to fill a single GVA page table entry and invalidate the TLB entry, rather than filling a TLB entry pair directly. Also remove the now unused kvm_mips_get_{kernel,user}_asid() functions in mmu.c and kvm_mips_host_tlb_write() in tlb.c. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 7 +-- arch/mips/kvm/emulate.c | 6 ++- arch/mips/kvm/mmu.c | 93 +++++++++++++++------------------------- arch/mips/kvm/tlb.c | 64 --------------------------- 4 files changed, 40 insertions(+), 130 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 40aab4f5007c..f7680999e28a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -599,7 +599,8 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb); + struct kvm_mips_tlb *tlb, + unsigned long gva); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, @@ -613,10 +614,6 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); -extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, - unsigned long entrylo1, - int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, bool user, bool kernel); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 19eaeda6975c..3ced662e012e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1770,7 +1770,8 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, * We fault an entry from the guest tlb to the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + va)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); @@ -2746,7 +2747,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + va)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index afb47f21d8bc..62122d297e52 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -61,22 +61,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } -static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) -{ - struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; - int cpu = smp_processor_id(); - - return cpu_asid(cpu, kern_mm); -} - -static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) -{ - struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - int cpu = smp_processor_id(); - - return cpu_asid(cpu, user_mm); -} - /** * kvm_mips_walk_pgd() - Walk page table with optional allocation. * @pgd: Page directory pointer. @@ -411,67 +395,58 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb) + struct kvm_mips_tlb *tlb, + unsigned long gva) { - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; - kvm_pfn_t pfn0, pfn1; - gfn_t gfn0, gfn1; - long tlb_lo[2]; - int ret; - - tlb_lo[0] = tlb->tlb_lo[0]; - tlb_lo[1] = tlb->tlb_lo[1]; + kvm_pfn_t pfn; + gfn_t gfn; + long tlb_lo = 0; + pte_t *ptep_gva; + unsigned int idx; + bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); /* * The commpage address must not be mapped to anything else if the guest * TLB contains entries nearby, or commpage accesses will break. */ - if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & - VPN2_MASK & (PAGE_MASK << 1))) - tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - - gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT; - gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT; - if (gfn0 >= kvm->arch.guest_pmap_npages || - gfn1 >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n", - __func__, gfn0, gfn1, tlb->tlb_hi); + idx = TLB_LO_IDX(*tlb, gva); + if ((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & PAGE_MASK) + tlb_lo = tlb->tlb_lo[idx]; + + /* Find host PFN */ + gfn = mips3_tlbpfn_to_paddr(tlb_lo) >> PAGE_SHIFT; + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, EHi: %#lx\n", + __func__, gfn, tlb->tlb_hi); kvm_mips_dump_guest_tlbs(vcpu); return -1; } - - if (kvm_mips_map_page(kvm, gfn0) < 0) + if (kvm_mips_map_page(kvm, gfn) < 0) return -1; + pfn = kvm->arch.guest_pmap[gfn]; - if (kvm_mips_map_page(kvm, gfn1) < 0) + /* Find GVA page table entry */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", gva); return -1; + } - pfn0 = kvm->arch.guest_pmap[gfn0]; - pfn1 = kvm->arch.guest_pmap[gfn1]; + /* Write PFN into GVA page table, taking attributes from Guest TLB */ + *ptep_gva = pfn_pte(pfn, (!(tlb_lo & ENTRYLO_V)) ? __pgprot(0) : + (tlb_lo & ENTRYLO_D) ? PAGE_SHARED : + PAGE_READONLY); + if (pte_present(*ptep_gva)) + *ptep_gva = pte_mkyoung(pte_mkdirty(*ptep_gva)); - /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[0] & ENTRYLO_D) | - (tlb_lo[0] & ENTRYLO_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[1] & ENTRYLO_D) | - (tlb_lo[1] & ENTRYLO_V); + /* Invalidate this entry in the TLB, current guest mode ASID only */ + kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); - preempt_disable(); - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? - kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); - preempt_enable(); - - return ret; + return 0; } void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, @@ -582,7 +557,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) return KVM_INVALID_INST; } if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch.guest_tlb[index])) { + &vcpu->arch.guest_tlb[index], va)) { kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", __func__, opc, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 06ee9a1d78a5..2fb76869d017 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -104,70 +104,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -/* XXXKYMA: Must be called with interrupts disabled */ -/* set flush_dcache_mask == 0 if no dcache flush required */ -int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, unsigned long entrylo1, - int flush_dcache_mask) -{ - unsigned long flags; - unsigned long old_entryhi; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi(entryhi); - mtc0_tlbw_hazard(); - - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - if (idx > current_cpu_data.tlbsize) { - kvm_err("%s: Invalid Index: %d\n", __func__, idx); - kvm_mips_dump_host_tlbs(); - local_irq_restore(flags); - return -1; - } - - write_c0_entrylo0(entrylo0); - write_c0_entrylo1(entrylo1); - mtc0_tlbw_hazard(); - - if (idx < 0) - tlb_write_random(); - else - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, idx, read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Flush D-cache */ - if (flush_dcache_mask) { - if (entrylo0 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page((entryhi & VPN2_MASK) & - ~flush_dcache_mask); - } - if (entrylo1 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page(((entryhi & VPN2_MASK) & - ~flush_dcache_mask) | - (0x1 << PAGE_SHIFT)); - } - } - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); - int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { -- cgit v1.2.3-70-g09d2 From 4c86460cbc03f32c3649f7aaf9104e6e65c88a61 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 01:16:21 +0100 Subject: KVM: MIPS/MMU: Convert commpage fault handling to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of commpage faults from the guest kernel to fill the GVA page table and invalidate the TLB entry, rather than filling the wired TLB entry directly. For simplicity we no longer use a wired entry for the commpage (refill should be much cheaper with the fast-path handler anyway). Since we don't need to manipulate the TLB directly any longer, move the function from tlb.c to mmu.c. This puts it closer to the similar functions handling KSeg0 and TLB mapped page faults from the guest. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mips.c | 34 +------------------------------ arch/mips/kvm/mmu.c | 21 +++++++++++++++++++ arch/mips/kvm/tlb.c | 44 ---------------------------------------- 4 files changed, 22 insertions(+), 80 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f7680999e28a..e38e11184c1c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -159,9 +159,6 @@ struct kvm_arch { /* Guest GVA->HPA page table */ unsigned long *guest_pmap; unsigned long guest_pmap_npages; - - /* Wired host TLB used for the commpage */ - int commpage_tlb; }; #define N_MIPS_COPROC_REGS 32 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c369fdd19fbc..c4fc52e39f34 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -92,28 +92,10 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static void kvm_mips_init_tlbs(struct kvm *kvm) -{ - unsigned long wired; - - /* - * Add a wired entry to the TLB, it is used to map the commpage to - * the Guest kernel - */ - wired = read_c0_wired(); - write_c0_wired(wired + 1); - mtc0_tlbw_hazard(); - kvm->arch.commpage_tlb = wired; - - kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(), - kvm->arch.commpage_tlb); -} - static void kvm_mips_init_vm_percpu(void *arg) { struct kvm *kvm = (struct kvm *)arg; - kvm_mips_init_tlbs(kvm); kvm_mips_callbacks->vm_init(kvm); } @@ -165,25 +147,11 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -static void kvm_mips_uninit_tlbs(void *arg) -{ - /* Restore wired count */ - write_c0_wired(0); - mtc0_tlbw_hazard(); - /* Clear out all the TLBs */ - kvm_local_flush_tlb_all(); -} - void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); - /* If this is the last instance, restore wired count */ - if (atomic_dec_return(&kvm_mips_instance) == 0) { - kvm_debug("%s: last KVM instance, restoring TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1); - } + atomic_dec(&kvm_mips_instance); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 62122d297e52..98f1a7715a68 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -449,6 +449,27 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, return 0; } +int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu) +{ + kvm_pfn_t pfn; + pte_t *ptep; + + ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr); + if (!ptep) { + kvm_err("No ptep for commpage %lx\n", badvaddr); + return -1; + } + + pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); + /* Also set valid and dirty, so refill handler doesn't have to */ + *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED))); + + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); + return 0; +} + void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu) { diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 2fb76869d017..919252662d5a 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -52,11 +52,6 @@ static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) return cpu_asid(cpu, user_mm); } -inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) -{ - return vcpu->kvm->arch.commpage_tlb; -} - /* Structure defining an tlb entry data set. */ void kvm_mips_dump_host_tlbs(void) @@ -104,45 +99,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) -{ - kvm_pfn_t pfn; - unsigned long flags, old_entryhi = 0, vaddr = 0; - unsigned long entrylo[2] = { 0, 0 }; - unsigned int pair_idx; - - pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); - pair_idx = (badvaddr >> PAGE_SHIFT) & 1; - entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - vaddr = badvaddr & (PAGE_MASK << 1); - write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - write_c0_entrylo0(entrylo[0]); - write_c0_entrylo1(entrylo[1]); - write_c0_index(kvm_mips_get_commpage_asid(vcpu)); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, read_c0_index(), read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault); - int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) { int i; -- cgit v1.2.3-70-g09d2 From 7a156e9f822d2eb6c294226aea2a4c12c05caa10 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 16 Nov 2016 17:36:47 +0000 Subject: KVM: MIPS: Drop vm_init() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the commpage doesn't use wired TLB entries, the per-CPU vm_init() callback is the only work done by kvm_mips_init_vm_percpu(). The trap & emulate implementation doesn't actually need to do anything from vm_init(), and the future VZ implementation would be better served by a kvm_arch_hardware_enable callback anyway. Therefore drop the vm_init() callback entirely, allowing the kvm_mips_init_vm_percpu() function to also be dropped, along with the kvm_mips_instance atomic counter. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mips.c | 16 ---------------- arch/mips/kvm/tlb.c | 3 --- arch/mips/kvm/trap_emul.c | 6 ------ 4 files changed, 28 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index e38e11184c1c..95c86dab9b1b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -121,8 +121,6 @@ static inline bool kvm_is_error_hva(unsigned long addr) return IS_ERR_VALUE(addr); } -extern atomic_t kvm_mips_instance; - struct kvm_vm_stat { ulong remote_tlb_flush; }; @@ -528,7 +526,6 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); - int (*vm_init)(struct kvm *kvm); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c4fc52e39f34..07ce10e3627a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -92,22 +92,8 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static void kvm_mips_init_vm_percpu(void *arg) -{ - struct kvm *kvm = (struct kvm *)arg; - - kvm_mips_callbacks->vm_init(kvm); - -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (atomic_inc_return(&kvm_mips_instance) == 1) { - kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1); - } - return 0; } @@ -150,8 +136,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); - - atomic_dec(&kvm_mips_instance); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 919252662d5a..8af5fd2cb107 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -33,9 +33,6 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 -atomic_t kvm_mips_instance; -EXPORT_SYMBOL_GPL(kvm_mips_instance); - static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 6a56e48f4bfa..8bb82eaa4c91 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -429,11 +429,6 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } -static int kvm_trap_emul_vm_init(struct kvm *kvm) -{ - return 0; -} - static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -847,7 +842,6 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, - .vm_init = kvm_trap_emul_vm_init, .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, -- cgit v1.2.3-70-g09d2 From dacc3ed1dd608ff9553dcede6cd05369030ed099 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 19 Aug 2016 15:27:22 +0100 Subject: KVM: MIPS: Use uaccess to read/modify guest instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables, use standard user accesses with page faults disabled to read & modify guest instructions. This should be more robust (than the rather dodgy method of accessing guest mapped segments by just directly addressing them) and will also work with Enhanced Virtual Addressing (EVA) host kernel configurations where dedicated instructions are needed for accessing user mode memory. For simplicity and speed we do this regardless of the guest segment the address resides in, rather than handling guest KSeg0 specially with kmap_atomic() as before. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 -- arch/mips/kvm/dyntrans.c | 28 +++++---------- arch/mips/kvm/mmu.c | 77 +++------------------------------------- arch/mips/kvm/trap_emul.c | 9 +++++ 4 files changed, 22 insertions(+), 94 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 95c86dab9b1b..a26504bee21c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -639,8 +639,6 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); -extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu); extern void kvm_local_flush_tlb_all(void); diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 010cef240688..60ebf5862d2b 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -29,28 +30,15 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, union mips_instruction replace) { - unsigned long paddr, flags; - void *vaddr; - - if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long)opc); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - memcpy(vaddr, (void *)&replace, sizeof(u32)); - local_flush_icache_range((unsigned long)vaddr, - (unsigned long)vaddr + 32); - kunmap_atomic(vaddr); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&replace, sizeof(u32)); - __local_flush_icache_user_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { + unsigned long vaddr = (unsigned long)opc; + int err; + + err = put_user(replace.word, opc); + if (unlikely(err)) { kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; + return err; } + __local_flush_icache_user_range(vaddr, vaddr + 4); return 0; } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 98f1a7715a68..c4e9c65065ea 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -134,34 +135,6 @@ out: return err; } -/* Translate guest KSEG0 addresses to Host PA */ -unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva) -{ - gfn_t gfn; - unsigned long offset = gva & ~PAGE_MASK; - struct kvm *kvm = vcpu->kvm; - - if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { - kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, - __builtin_return_address(0), gva); - return KVM_INVALID_PAGE; - } - - gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); - - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, - gva); - return KVM_INVALID_PAGE; - } - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return KVM_INVALID_ADDR; - - return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; -} - static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, unsigned long addr) { @@ -551,51 +524,11 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) { - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long paddr, flags, vpn2, asid; - unsigned long va = (unsigned long)opc; - void *vaddr; u32 inst; - int index; - - if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, va); - if (index >= 0) { - inst = *(opc); - } else { - vpn2 = va & VPN2_MASK; - asid = kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID; - index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); - if (index < 0) { - kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", - __func__, opc, vcpu, read_c0_entryhi()); - kvm_mips_dump_host_tlbs(); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch.guest_tlb[index], va)) { - kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, opc, index, vcpu, - read_c0_entryhi()); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - inst = *(opc); - } - local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - inst = *(u32 *)vaddr; - kunmap_atomic(vaddr); - } else { + int err; + + err = get_user(inst, opc); + if (unlikely(err)) { kvm_err("%s: illegal address: %p\n", __func__, opc); return KVM_INVALID_INST; } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 8bb82eaa4c91..ee8b5ad8c7c5 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -798,6 +799,12 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_trap_emul_vcpu_reenter(run, vcpu); + /* + * We use user accessors to access guest memory, but we don't want to + * invoke Linux page faulting. + */ + pagefault_disable(); + /* Disable hardware page table walking while in guest */ htw_stop(); @@ -823,6 +830,8 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) htw_start(); + pagefault_enable(); + return r; } -- cgit v1.2.3-70-g09d2 From 8af0e3c2e89e56dc5b064e5854b87a19e70e2710 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 17 Oct 2016 16:37:45 +0100 Subject: KVM: MIPS/Emulate: Fix CACHE emulation for EVA hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use protected_writeback_dcache_line() instead of flush_dcache_line(), and protected_flush_icache_line() instead of flush_icache_line(), so that CACHEE (the EVA variant) is used on EVA host kernels. Without this, guest floating point branch delay slot emulation via a trampoline on the user stack fails on EVA host kernels due to failure of the icache sync, resulting in the break instruction getting skipped and execution from the stack. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 3ced662e012e..9ac8e45017ce 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1792,7 +1792,7 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, skip_fault: /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { - flush_dcache_line(va); + protected_writeback_dcache_line(va); #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* @@ -1802,8 +1802,8 @@ skip_fault: kvm_mips_trans_cache_va(inst, opc, vcpu); #endif } else if (op_inst == Hit_Invalidate_I) { - flush_dcache_line(va); - flush_icache_line(va); + protected_writeback_dcache_line(va); + protected_flush_icache_line(va); #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* Replace the CACHE instruction, with a SYNCI */ -- cgit v1.2.3-70-g09d2 From 49ec508e3bd0b11aaf534af0d63e4a17e05594e4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:32:13 +0100 Subject: KVM: MIPS/TLB: Drop kvm_local_flush_tlb_all() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that KVM no longer uses wired entries we can safely use local_flush_tlb_all() when we need to flush the entire TLB (on the start of a new ASID cycle). This doesn't flush wired entries, which allows other code to use them without KVM clobbering them all the time. It also is more up to date, knowing about the tlbinv architectural feature, flushing of micro TLB on cores where that is necessary (Loongson I believe), and knows to stop the HTW while doing so. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/include/asm/mmu_context.h | 5 ----- arch/mips/kvm/mmu.c | 2 +- arch/mips/kvm/tlb.c | 29 ----------------------------- 4 files changed, 1 insertion(+), 36 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a26504bee21c..1a83b6f85de2 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -641,7 +641,6 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu); -extern void kvm_local_flush_tlb_all(void); extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index 16eb8521398e..2abf94f72c0a 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -99,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void get_new_mmu_context(struct mm_struct *mm, unsigned long cpu) { - extern void kvm_local_flush_tlb_all(void); unsigned long asid = asid_cache(cpu); if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) { if (cpu_has_vtag_icache) flush_icache_all(); -#ifdef CONFIG_KVM - kvm_local_flush_tlb_all(); /* start new asid cycle */ -#else local_flush_tlb_all(); /* start new asid cycle */ -#endif if (!asid) /* fix version if needed */ asid = asid_first_version(cpu); } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c4e9c65065ea..cf832ea963d8 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -453,7 +453,7 @@ void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, if (cpu_has_vtag_icache) flush_icache_all(); - kvm_local_flush_tlb_all(); /* start new asid cycle */ + local_flush_tlb_all(); /* start new asid cycle */ if (!asid) /* fix version if needed */ asid = asid_first_version(cpu); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 8af5fd2cb107..51f4aee717e7 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -263,35 +263,6 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) } EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); -void kvm_local_flush_tlb_all(void) -{ - unsigned long flags; - unsigned long old_ctx; - int entry = 0; - - local_irq_save(flags); - /* Save old context and create impossible VPN2 value */ - old_ctx = read_c0_entryhi(); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - - /* Blast 'em all away. */ - while (entry < current_cpu_data.tlbsize) { - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_index(entry); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - entry++; - } - write_c0_entryhi(old_ctx); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); - /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. -- cgit v1.2.3-70-g09d2 From 7071a8859bb36d34b8f766275b239e98760f010c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Sep 2016 12:58:08 +0100 Subject: KVM: MIPS/Emulate: Drop redundant TLB flushes on exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When exceptions are injected into the MIPS KVM guest, the whole host TLB is flushed (except any entries in the guest KSeg0 range). This is certainly not mandated by the architecture when exceptions are taken (userland can't directly change TLB mappings anyway), and is a pretty heavyweight operation: - There may be hundreds of TLB entries especially when a 512 entry FTLB is present. These are walked and read and conditionally invalidated, so the TLBINV feature can't be used either. - It'll indiscriminately wipe out entries belonging to other memory spaces. A simple ASID regeneration would be much faster to perform, although it'd wipe out the guest KSeg0 mappings too. My suspicion is that this was simply to plaster over the fact that kvm_mips_host_tlb_inv() incorrectly only invalidated TLB entries in the ASID for guest usermode, and not the ASID for guest kernelmode. Now that the recent commit "KVM: MIPS/TLB: Flush host TLB entry in kernel ASID" fixes kvm_mips_host_tlb_inv() to flush TLB entries in the kernelmode ASID when the guest TLB changes, lets drop these calls and the otherwise unused kvm_mips_flush_host_tlb(). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/emulate.c | 10 -------- arch/mips/kvm/tlb.c | 49 ---------------------------------------- 3 files changed, 60 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1a83b6f85de2..174857f146b1 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -608,7 +608,6 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); -extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, bool user, bool kernel); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 9ac8e45017ce..cd11d787d9dc 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1968,8 +1968,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2014,8 +2012,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2058,8 +2054,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2102,8 +2096,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2176,8 +2168,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 51f4aee717e7..cee2e9feb942 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -214,55 +214,6 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); -void kvm_mips_flush_host_tlb(int skip_kseg0) -{ - unsigned long flags; - unsigned long old_entryhi, entryhi; - unsigned long old_pagemask; - int entry = 0; - int maxentry = current_cpu_data.tlbsize; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - old_pagemask = read_c0_pagemask(); - - /* Blast 'em all away. */ - for (entry = 0; entry < maxentry; entry++) { - write_c0_index(entry); - - if (skip_kseg0) { - mtc0_tlbr_hazard(); - tlb_read(); - tlb_read_hazard(); - - entryhi = read_c0_entryhi(); - - /* Don't blow away guest kernel entries */ - if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) - continue; - - write_c0_pagemask(old_pagemask); - } - - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - mtc0_tlbw_hazard(); - - tlb_write_indexed(); - tlbw_use_hazard(); - } - - write_c0_entryhi(old_entryhi); - write_c0_pagemask(old_pagemask); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); - /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. -- cgit v1.2.3-70-g09d2 From a98dd7410370634496aa500abf90ec6336b494ae Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:39:41 +0100 Subject: KVM: MIPS/MMU: Drop kvm_get_new_mmu_context() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS KVM uses its own variation of get_new_mmu_context() which takes an extra vcpu pointer (unused) and does exactly the same thing. Switch to just using get_new_mmu_context() directly and drop KVM's version of it as it doesn't really serve any purpose. The nearby declarations of kvm_mips_alloc_new_mmu_context(), kvm_mips_vcpu_load() and kvm_mips_vcpu_put() are also removed from kvm_host.h, as no definitions or users exist. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 5 ----- arch/mips/kvm/emulate.c | 3 +-- arch/mips/kvm/mmu.c | 19 ------------------- arch/mips/kvm/trap_emul.c | 6 +++--- 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 174857f146b1..1337abb18e2b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -638,11 +638,6 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); -extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu); -extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); -extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); /* Emulation */ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index cd11d787d9dc..67ea39973b96 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1198,8 +1198,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, */ preempt_disable(); cpu = smp_processor_id(); - kvm_get_new_mmu_context(kern_mm, - cpu, vcpu); + get_new_mmu_context(kern_mm, cpu); for_each_possible_cpu(i) if (i != cpu) cpu_context(i, kern_mm) = 0; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index cf832ea963d8..aab604e75d3b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -443,25 +443,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, return 0; } -void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu) -{ - unsigned long asid = asid_cache(cpu); - - asid += cpu_asid_inc(); - if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { - if (cpu_has_vtag_icache) - flush_icache_all(); - - local_flush_tlb_all(); /* start new asid cycle */ - - if (!asid) /* fix version if needed */ - asid = asid_first_version(cpu); - } - - cpu_context(cpu, mm) = asid_cache(cpu) = asid; -} - /** * kvm_mips_migrate_count() - Migrate timer. * @vcpu: Virtual CPU. diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ee8b5ad8c7c5..653850c05b33 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -706,7 +706,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(kern_mm, cpu, vcpu); + get_new_mmu_context(kern_mm, cpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -716,7 +716,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(user_mm, cpu, vcpu); + get_new_mmu_context(user_mm, cpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -779,7 +779,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); - kvm_get_new_mmu_context(user_mm, cpu, vcpu); + get_new_mmu_context(user_mm, cpu); for_each_possible_cpu(i) if (i != cpu) cpu_context(i, user_mm) = 0; -- cgit v1.2.3-70-g09d2 From a1ecc54d7ea629538116351a3ccc7d86bb9a3c69 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 18:39:24 +0000 Subject: KVM: MIPS/T&E: Don't treat code fetch faults as MMIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to make use of the CP0_BadInstr & CP0_BadInstrP registers we need to be a bit more careful not to treat code fetch faults as MMIO, lest we hit an UNPREDICTABLE register value when we try to emulate the MMIO load instruction but there was no valid instruction word available to the hardware. Add a kvm_is_ifetch_fault() helper to try to figure out whether a load fault was due to a code fetch, and prevent MMIO instruction emulation in that case. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 27 +++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 12 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1337abb18e2b..6f68f7545b66 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -643,6 +643,33 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); +/** + * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. + * @vcpu: Virtual CPU. + * + * Returns: Whether the TLBL exception was likely due to an instruction + * fetch fault rather than a data load fault. + */ +static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu) +{ + unsigned long badvaddr = vcpu->host_cp0_badvaddr; + unsigned long epc = msk_isa16_mode(vcpu->pc); + u32 cause = vcpu->host_cp0_cause; + + if (epc == badvaddr) + return true; + + /* + * Branches may be 32-bit or 16-bit instructions. + * This isn't exact, but we don't really support MIPS16 or microMIPS yet + * in KVM anyway. + */ + if ((cause & CAUSEF_BD) && badvaddr - epc <= 4) + return true; + + return false; +} + extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 653850c05b33..ccd56b3ce84b 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -178,6 +178,12 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) } } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { + /* A code fetch fault doesn't count as an MMIO */ + if (!store && kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. @@ -255,6 +261,12 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); er = kvm_mips_emulate_inst(cause, opc, run, vcpu); if (er == EMULATE_FAIL) { -- cgit v1.2.3-70-g09d2 From 122e51d47418f74a69a93bf02f5535d11ff75bf5 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 17:23:14 +0000 Subject: KVM: MIPS: Improve kvm_get_inst() error return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently kvm_get_inst() returns KVM_INVALID_INST in the event of a fault reading the guest instruction. This has the rather arbitrary magic value 0xdeadbeef. This API isn't very robust, and in fact 0xdeadbeef is a valid MIPS64 instruction encoding, namely "ld t1,-16657(s5)". Therefore change the kvm_get_inst() API to return 0 or -EFAULT, and to return the instruction via a u32 *out argument. We can then drop the KVM_INVALID_INST definition entirely. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 +- arch/mips/kvm/emulate.c | 90 ++++++++++++++++++++-------------------- arch/mips/kvm/mips.c | 7 +++- arch/mips/kvm/mmu.c | 9 ++-- 4 files changed, 56 insertions(+), 53 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6f68f7545b66..f296ebeda9e3 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -104,7 +104,6 @@ #define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23) #define KVM_INVALID_PAGE 0xdeadbeef -#define KVM_INVALID_INST 0xdeadbeef #define KVM_INVALID_ADDR 0xdeadbeef /* @@ -640,7 +639,7 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); /* Emulation */ -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); /** diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 67ea39973b96..b906fc0589f3 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -38,23 +38,25 @@ * Compute the return address and do emulate branch simulation, if required. * This function should be called only in branch delay slot active. */ -unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, - unsigned long instpc) +static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc, + unsigned long *out) { unsigned int dspcontrol; union mips_instruction insn; struct kvm_vcpu_arch *arch = &vcpu->arch; long epc = instpc; - long nextpc = KVM_INVALID_INST; + long nextpc; + int err; - if (epc & 3) - goto unaligned; + if (epc & 3) { + kvm_err("%s: unaligned epc\n", __func__); + return -EINVAL; + } /* Read the instruction */ - insn.word = kvm_get_inst((u32 *) epc, vcpu); - - if (insn.word == KVM_INVALID_INST) - return KVM_INVALID_INST; + err = kvm_get_inst((u32 *)epc, vcpu, &insn.word); + if (err) + return err; switch (insn.i_format.opcode) { /* jr and jalr are in r_format format. */ @@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case jr_op: nextpc = arch->gprs[insn.r_format.rs]; break; + default: + return -EINVAL; } break; @@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; case bposge32_op: - if (!cpu_has_dsp) - goto sigill; + if (!cpu_has_dsp) { + kvm_err("%s: DSP branch but not DSP ASE\n", + __func__); + return -EINVAL; + } dspcontrol = rddsp(0x01); @@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, epc += 8; nextpc = epc; break; + default: + return -EINVAL; } break; @@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* And now the FPA/cp1 branch instructions. */ case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); - break; + return -EINVAL; #ifdef CONFIG_CPU_MIPSR6 /* R6 added the following compact branches with forbidden slots */ @@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* only rt == 0 isn't compact branch */ if (insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop10_op: case pop30_op: /* only rs == rt == 0 is reserved, rest are compact branches */ if (insn.i_format.rs != 0 || insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop66_op: case pop76_op: /* only rs == 0 isn't compact branch */ if (insn.i_format.rs != 0) goto compact_branch; - break; + return -EINVAL; compact_branch: /* * If we've hit an exception on the forbidden slot, then @@ -221,42 +230,32 @@ compact_branch: break; #else compact_branch: - /* Compact branches not supported before R6 */ - break; + /* Fall through - Compact branches not supported before R6 */ #endif + default: + return -EINVAL; } - return nextpc; - -unaligned: - kvm_err("%s: unaligned epc\n", __func__); - return nextpc; - -sigill: - kvm_err("%s: DSP branch but not DSP ASE\n", __func__); - return nextpc; + *out = nextpc; + return 0; } enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) { - unsigned long branch_pc; - enum emulation_result er = EMULATE_DONE; + int err; if (cause & CAUSEF_BD) { - branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc); - if (branch_pc == KVM_INVALID_INST) { - er = EMULATE_FAIL; - } else { - vcpu->arch.pc = branch_pc; - kvm_debug("BD update_pc(): New PC: %#lx\n", - vcpu->arch.pc); - } - } else + err = kvm_compute_return_epc(vcpu, vcpu->arch.pc, + &vcpu->arch.pc); + if (err) + return EMULATE_FAIL; + } else { vcpu->arch.pc += 4; + } kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc); - return er; + return EMULATE_DONE; } /** @@ -1835,12 +1834,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, { union mips_instruction inst; enum emulation_result er = EMULATE_DONE; + int err; /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); + err = kvm_get_inst(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; switch (inst.r_format.opcode) { case cop0_op: @@ -2419,6 +2420,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; union mips_instruction inst; + int err; /* * Update PC and hold onto current PC in case there is @@ -2432,11 +2434,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); - - if (inst.word == KVM_INVALID_INST) { - kvm_err("%s: Cannot get inst @ %p\n", __func__, opc); + err = kvm_get_inst(opc, vcpu, &inst.word); + if (err) { + kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err); return EMULATE_FAIL; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 07ce10e3627a..29afd96069ef 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1343,6 +1343,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; enum emulation_result er = EMULATE_DONE; + u32 inst; int ret = RESUME_GUEST; /* re-enable HTW before enabling interrupts */ @@ -1467,8 +1468,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) break; default: + if (cause & CAUSEF_BD) + opc += 1; + inst = 0; + kvm_get_inst(opc, vcpu, &inst); kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", - exccode, opc, kvm_get_inst(opc, vcpu), badvaddr, + exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index aab604e75d3b..6379ac1bc7b9 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -503,16 +503,15 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_restore(flags); } -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { - u32 inst; int err; - err = get_user(inst, opc); + err = get_user(*out, opc); if (unlikely(err)) { kvm_err("%s: illegal address: %p\n", __func__, opc); - return KVM_INVALID_INST; + return -EFAULT; } - return inst; + return 0; } -- cgit v1.2.3-70-g09d2 From 6a97c775ff77fb7c54adc3f7944205ae66cb5475 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Apr 2015 16:54:35 +0100 Subject: KVM: MIPS: Use CP0_BadInstr[P] for emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When exiting from the guest, store the values of the CP0_BadInstr and CP0_BadInstrP registers if they exist, which contain the encodings of the instructions which caused the last synchronous exception. When the instruction is needed for emulation, kvm_get_badinstr() and kvm_get_badinstrp() are used instead of calling kvm_get_inst() directly, to decide whether to read the saved CP0_BadInstr/CP0_BadInstrP registers (if they exist), or read the instruction from memory (if not). The use of these registers should be more robust than using kvm_get_inst(), as it actually gives the instruction encoding seen by the hardware rather than relying on user accessors after the fact, which can be fooled by incoherent icache or a racing code modification. It will also work with VZ, where the guest virtual memory isn't directly accessible by the host with user accessors. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++++ arch/mips/kvm/emulate.c | 48 +++++++++++++++++++++++++++++++++++++--- arch/mips/kvm/entry.c | 14 ++++++++++++ arch/mips/kvm/mips.c | 2 +- 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f296ebeda9e3..17c5e53ef3fa 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -280,6 +280,8 @@ struct kvm_vcpu_arch { unsigned long host_cp0_badvaddr; unsigned long host_cp0_epc; u32 host_cp0_cause; + u32 host_cp0_badinstr; + u32 host_cp0_badinstrp; /* GPRS */ unsigned long gprs[32]; @@ -641,6 +643,8 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); /** * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b906fc0589f3..b295a4a1496f 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -54,7 +54,7 @@ static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc, } /* Read the instruction */ - err = kvm_get_inst((u32 *)epc, vcpu, &insn.word); + err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word); if (err) return err; @@ -258,6 +258,48 @@ enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) return EMULATE_DONE; } +/** + * kvm_get_badinstr() - Get bad instruction encoding. + * @opc: Guest pointer to faulting instruction. + * @vcpu: KVM VCPU information. + * + * Gets the instruction encoding of the faulting instruction, using the saved + * BadInstr register value if it exists, otherwise falling back to reading guest + * memory at @opc. + * + * Returns: The instruction encoding of the faulting instruction. + */ +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstr) { + *out = vcpu->arch.host_cp0_badinstr; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } +} + +/** + * kvm_get_badinstrp() - Get bad prior instruction encoding. + * @opc: Guest pointer to prior faulting instruction. + * @vcpu: KVM VCPU information. + * + * Gets the instruction encoding of the prior faulting instruction (the branch + * containing the delay slot which faulted), using the saved BadInstrP register + * value if it exists, otherwise falling back to reading guest memory at @opc. + * + * Returns: The instruction encoding of the prior faulting instruction. + */ +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstrp) { + *out = vcpu->arch.host_cp0_badinstrp; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } +} + /** * kvm_mips_count_disabled() - Find whether the CP0_Count timer is disabled. * @vcpu: Virtual CPU. @@ -1839,7 +1881,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - err = kvm_get_inst(opc, vcpu, &inst.word); + err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) return EMULATE_FAIL; @@ -2434,7 +2476,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - err = kvm_get_inst(opc, vcpu, &inst.word); + err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err); return EMULATE_FAIL; diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 1ae33e0e675c..c5b254c4d0da 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -53,6 +53,8 @@ /* Some CP0 registers */ #define C0_HWRENA 7, 0 #define C0_BADVADDR 8, 0 +#define C0_BADINSTR 8, 1 +#define C0_BADINSTRP 8, 2 #define C0_ENTRYHI 10, 0 #define C0_STATUS 12, 0 #define C0_CAUSE 13, 0 @@ -579,6 +581,18 @@ void *kvm_mips_build_exit(void *addr) uasm_i_mfc0(&p, K0, C0_CAUSE); uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + if (cpu_has_badinstr) { + uasm_i_mfc0(&p, K0, C0_BADINSTR); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstr), K1); + } + + if (cpu_has_badinstrp) { + uasm_i_mfc0(&p, K0, C0_BADINSTRP); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstrp), K1); + } + /* Now restore the host state just enough to run the handlers */ /* Switch EBASE to the one used by Linux */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29afd96069ef..b8f04070bf39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1471,7 +1471,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) if (cause & CAUSEF_BD) opc += 1; inst = 0; - kvm_get_inst(opc, vcpu, &inst); + kvm_get_badinstr(opc, vcpu, &inst); kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); -- cgit v1.2.3-70-g09d2 From 06c158c96ed8909a1d8696d4f953ca8a9ef55574 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 1 May 2015 13:50:18 +0100 Subject: KVM: MIPS/MMU: Convert guest physical map to page table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current guest physical memory is mapped to host physical addresses using a single linear array (guest_pmap of length guest_pmap_npages). This was only really meant to be temporary, and isn't sparse, so its wasteful of memory. A small amount of RAM at GPA 0 and a small boot exception vector at GPA 0x1fc00000 cannot be represented without a full 128KiB guest_pmap allocation (MIPS32 with 16KiB pages), which is one reason why QEMU currently runs its boot code at the top of RAM instead of the usual boot exception vector address. Instead use the existing infrastructure for host virtual page table management to allocate a page table for guest physical memory too. This should be sufficient for now, assuming the size of physical memory doesn't exceed the size of virtual memory. It may need extending in future to handle XPA (eXtended Physical Addressing) in 32-bit guests, as supported by VZ guests on P5600. Some of this code is based loosely on Cavium's VZ KVM implementation. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 7 +- arch/mips/kvm/mips.c | 48 ++----- arch/mips/kvm/mmu.c | 293 ++++++++++++++++++++++++++++++++++----- 3 files changed, 278 insertions(+), 70 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 17c5e53ef3fa..a04657932e19 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -153,9 +153,8 @@ struct kvm_arch_memory_slot { }; struct kvm_arch { - /* Guest GVA->HPA page table */ - unsigned long *guest_pmap; - unsigned long guest_pmap_npages; + /* Guest physical mm */ + struct mm_struct gpa_mm; }; #define N_MIPS_COPROC_REGS 32 @@ -636,6 +635,8 @@ enum kvm_mips_flush { KMF_GPA = 0x2, }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); +bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b8f04070bf39..9b72e2c55a0c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -94,6 +95,11 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + /* Allocate page table to map GPA -> RPA */ + kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); + if (!kvm->arch.gpa_mm.pgd) + return -ENOMEM; + return 0; } @@ -112,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; - /* Put the pages we reserved for the guest pmap */ - for (i = 0; i < kvm->arch.guest_pmap_npages; i++) { - if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) - kvm_release_pfn_clean(kvm->arch.guest_pmap[i]); - } - kfree(kvm->arch.guest_pmap); - kvm_for_each_vcpu(i, vcpu, kvm) { kvm_arch_vcpu_free(vcpu); } @@ -133,9 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } +static void kvm_mips_free_gpa_pt(struct kvm *kvm) +{ + /* It should always be safe to remove after flushing the whole range */ + WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0)); + pgd_free(NULL, kvm->arch.gpa_mm.pgd); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); + kvm_mips_free_gpa_pt(kvm); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, @@ -164,36 +171,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - unsigned long npages = 0; - int i; - kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", __func__, kvm, mem->slot, mem->guest_phys_addr, mem->memory_size, mem->userspace_addr); - - /* Setup Guest PMAP table */ - if (!kvm->arch.guest_pmap) { - if (mem->slot == 0) - npages = mem->memory_size >> PAGE_SHIFT; - - if (npages) { - kvm->arch.guest_pmap_npages = npages; - kvm->arch.guest_pmap = - kzalloc(npages * sizeof(unsigned long), GFP_KERNEL); - - if (!kvm->arch.guest_pmap) { - kvm_err("Failed to allocate guest PMAP\n"); - return; - } - - kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", - npages, kvm->arch.guest_pmap); - - /* Now setup the page table */ - for (i = 0; i < npages; i++) - kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE; - } - } } static inline void dump_handler(const char *symbol, void *start, void *end) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6379ac1bc7b9..09f5da706d9a 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -62,6 +62,63 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } +/** + * kvm_pgd_init() - Initialise KVM GPA page directory. + * @page: Pointer to page directory (PGD) for KVM GPA. + * + * Initialise a KVM GPA page directory with pointers to the invalid table, i.e. + * representing no mappings. This is similar to pgd_init(), however it + * initialises all the page directory pointers, not just the ones corresponding + * to the userland address space (since it is for the guest physical address + * space rather than a virtual address space). + */ +static void kvm_pgd_init(void *page) +{ + unsigned long *p, *end; + unsigned long entry; + +#ifdef __PAGETABLE_PMD_FOLDED + entry = (unsigned long)invalid_pte_table; +#else + entry = (unsigned long)invalid_pmd_table; +#endif + + p = (unsigned long *)page; + end = p + PTRS_PER_PGD; + + do { + p[0] = entry; + p[1] = entry; + p[2] = entry; + p[3] = entry; + p[4] = entry; + p += 8; + p[-3] = entry; + p[-2] = entry; + p[-1] = entry; + } while (p != end); +} + +/** + * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. + * + * Allocate a blank KVM GPA page directory (PGD) for representing guest physical + * to host physical page mappings. + * + * Returns: Pointer to new KVM GPA page directory. + * NULL on allocation failure. + */ +pgd_t *kvm_pgd_alloc(void) +{ + pgd_t *ret; + + ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER); + if (ret) + kvm_pgd_init(ret); + + return ret; +} + /** * kvm_mips_walk_pgd() - Walk page table with optional allocation. * @pgd: Page directory pointer. @@ -112,15 +169,182 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, return pte_offset(pmd, addr); } -static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) +/* Caller must hold kvm->mm_lock */ +static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + unsigned long addr) { - int srcu_idx, err = 0; - kvm_pfn_t pfn; + return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr); +} - if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) - return 0; +/* + * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}. + * Flush a range of guest physical address space from the VM's GPA page tables. + */ + +static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa, + unsigned long end_gpa) +{ + int i_min = __pte_offset(start_gpa); + int i_max = __pte_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1); + int i; + + for (i = i_min; i <= i_max; ++i) { + if (!pte_present(pte[i])) + continue; + + kvm_release_pfn_clean(pte_pfn(pte[i])); + set_pte(pte + i, __pte(0)); + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa, + unsigned long end_gpa) +{ + pte_t *pte; + unsigned long end = ~0ul; + int i_min = __pmd_offset(start_gpa); + int i_max = __pmd_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pmd_present(pmd[i])) + continue; + + pte = pte_offset(pmd + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) { + pmd_clear(pmd + i); + pte_free_kernel(NULL, pte); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa, + unsigned long end_gpa) +{ + pmd_t *pmd; + unsigned long end = ~0ul; + int i_min = __pud_offset(start_gpa); + int i_max = __pud_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pud_present(pud[i])) + continue; + + pmd = pmd_offset(pud + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) { + pud_clear(pud + i); + pmd_free(NULL, pmd); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa, + unsigned long end_gpa) +{ + pud_t *pud; + unsigned long end = ~0ul; + int i_min = pgd_index(start_gpa); + int i_max = pgd_index(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pgd_present(pgd[i])) + continue; + + pud = pud_offset(pgd + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) { + pgd_clear(pgd + i); + pud_free(NULL, pud); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +/** + * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Flushes a range of GPA mappings from the GPA page tables. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether its safe to remove the top level page directory because + * all lower levels have been removed. + */ +bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +/** + * kvm_mips_map_page() - Map a guest physical page. + * @vcpu: VCPU pointer. + * @gpa: Guest physical address of fault. + * @out_entry: New PTE for @gpa (written on success unless NULL). + * @out_buddy: New PTE for @gpa's buddy (written on success unless + * NULL). + * + * Handle GPA faults by creating a new GPA mapping (or updating an existing + * one). + * + * This takes care of asking KVM for the corresponding PFN, and creating a + * mapping in the GPA page tables. Derived mappings (GVA page tables and TLBs) + * must be handled by the caller. + * + * Returns: 0 on success, in which case the caller may use the @out_entry + * and @out_buddy PTEs to update derived mappings and resume guest + * execution. + * -EFAULT if there is no memory region at @gpa or a write was + * attempted to a read-only memory region. This is usually handled + * as an MMIO access. + */ +static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, + pte_t *out_entry, pte_t *out_buddy) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + gfn_t gfn = gpa >> PAGE_SHIFT; + int srcu_idx, err; + kvm_pfn_t pfn; + pte_t *ptep, entry, old_pte; + unsigned long prot_bits; srcu_idx = srcu_read_lock(&kvm->srcu); + + /* We need a minimum of cached pages ready for page table creation */ + err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); + if (err) + goto out; + pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { @@ -129,7 +353,25 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) goto out; } - kvm->arch.guest_pmap[gfn] = pfn; + spin_lock(&kvm->mmu_lock); + + ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); + + prot_bits = __READABLE | _PAGE_PRESENT | __WRITEABLE; + entry = pfn_pte(pfn, __pgprot(prot_bits)); + + old_pte = *ptep; + set_pte(ptep, entry); + if (pte_present(old_pte)) + kvm_release_pfn_clean(pte_pfn(old_pte)); + + err = 0; + if (out_entry) + *out_entry = *ptep; + if (out_buddy) + *out_buddy = *ptep_buddy(ptep); + + spin_unlock(&kvm->mmu_lock); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; @@ -318,11 +560,10 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { - gfn_t gfn; + unsigned long gpa; kvm_pfn_t pfn0, pfn1; - unsigned long vaddr = 0; - struct kvm *kvm = vcpu->kvm; - pte_t *ptep_gva; + unsigned long vaddr; + pte_t pte_gpa[2], *ptep_gva; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -332,23 +573,17 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, /* Find host PFNs */ - gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, - gfn, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } + gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); vaddr = badvaddr & (PAGE_MASK << 1); - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + if (kvm_mips_map_page(vcpu, gpa, &pte_gpa[0], NULL) < 0) return -1; - if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) + if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, &pte_gpa[1], NULL) < 0) return -1; - pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; - pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; + pfn0 = pte_pfn(pte_gpa[0]); + pfn1 = pte_pfn(pte_gpa[1]); /* Find GVA page table entry */ @@ -371,11 +606,9 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, unsigned long gva) { - struct kvm *kvm = vcpu->kvm; kvm_pfn_t pfn; - gfn_t gfn; long tlb_lo = 0; - pte_t *ptep_gva; + pte_t pte_gpa, *ptep_gva; unsigned int idx; bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); @@ -388,16 +621,10 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, tlb_lo = tlb->tlb_lo[idx]; /* Find host PFN */ - gfn = mips3_tlbpfn_to_paddr(tlb_lo) >> PAGE_SHIFT; - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, EHi: %#lx\n", - __func__, gfn, tlb->tlb_hi); - kvm_mips_dump_guest_tlbs(vcpu); - return -1; - } - if (kvm_mips_map_page(kvm, gfn) < 0) + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), &pte_gpa, + NULL) < 0) return -1; - pfn = kvm->arch.guest_pmap[gfn]; + pfn = pte_pfn(pte_gpa); /* Find GVA page table entry */ ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); -- cgit v1.2.3-70-g09d2 From 4841e0dd4f53c127b11947bdbe4423b5e9014ebc Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 22:45:04 +0000 Subject: KVM: MIPS: Update vcpu->mode and vcpu->cpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the vcpu->mode and vcpu->cpu variables up to date so that kvm_make_all_cpus_request() has a chance of functioning correctly. This will soon need to be used for kvm_flush_remote_tlbs(). We can easily update vcpu->cpu when the VCPU context is loaded or saved, which will happen when accessing guest context and when the guest is scheduled in and out. We need to be a little careful with vcpu->mode though, as we will in future be checking for outstanding VCPU requests, and this must be done after the value of IN_GUEST_MODE in vcpu->mode is visible to other CPUs. Otherwise the other CPU could fail to trigger an IPI to wait for completion dispite the VCPU request not being seen. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 18 ++++++++++++++++++ arch/mips/kvm/mmu.c | 2 ++ 2 files changed, 20 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9b72e2c55a0c..ff5e34293227 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -365,6 +365,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_enter_irqoff(); trace_kvm_enter(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_run() callback is not + * reordered ahead of the write to vcpu->mode, or we could miss a TLB + * flush request while the requester sees the VCPU as outside of guest + * mode and not needing an IPI. + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + r = kvm_mips_callbacks->vcpu_run(run, vcpu); trace_kvm_out(vcpu); @@ -1326,6 +1334,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 inst; int ret = RESUME_GUEST; + vcpu->mode = OUTSIDE_GUEST_MODE; + /* re-enable HTW before enabling interrupts */ htw_start(); @@ -1481,6 +1491,14 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_reenter() + * callback is not reordered ahead of the write to vcpu->mode, + * or we could miss a TLB flush request while the requester sees + * the VCPU as outside of guest mode and not needing an IPI. + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + kvm_mips_callbacks->vcpu_reenter(run, vcpu); /* diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 09f5da706d9a..e41ee36dd626 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -696,6 +696,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) local_irq_save(flags); + vcpu->cpu = cpu; if (vcpu->arch.last_sched_cpu != cpu) { kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); @@ -723,6 +724,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) cpu = smp_processor_id(); vcpu->arch.last_sched_cpu = cpu; + vcpu->cpu = -1; /* save guest state in registers */ kvm_mips_callbacks->vcpu_put(vcpu, cpu); -- cgit v1.2.3-70-g09d2 From b29e115ae451a67a8c044dffb3aa02b19d4570a0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:19:32 +0000 Subject: KVM: MIPS/T&E: Handle TLB invalidation requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add handling of TLB invalidation requests before entering guest mode. This will allow asynchonous invalidation of the VCPU mappings when physical memory regions are altered. Should the CPU running the VCPU already be in guest mode an IPI will be sent to trigger a guest exit. The reload_asid path will be used in a future patch for when GVA is about to be directly accessed by KVM. In the process, the stale user ASID check in the re-entry path (for lazy user GVA flushing) is generalised to check the ASID for the current guest mode, in case a TLB invalidation request was handled. This has the side effect of making the ASID checks on vcpu_load too conservative, which will be addressed in a later patch. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 71 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ccd56b3ce84b..2b20b7de493e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -773,31 +773,86 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) return 0; } +static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, + bool reload_asid) +{ + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; + int i; + + if (likely(!vcpu->requests)) + return; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + /* + * Both kernel & user GVA mappings must be invalidated. The + * caller is just about to check whether the ASID is stale + * anyway so no need to reload it here. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN); + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER); + for_each_possible_cpu(i) { + cpu_context(i, kern_mm) = 0; + cpu_context(i, user_mm) = 0; + } + + /* Generate new ASID for current mode */ + if (reload_asid) { + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm; + get_new_mmu_context(mm, cpu); + htw_stop(); + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); + htw_start(); + } + } +} + static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; struct mips_coproc *cop0 = vcpu->arch.cop0; int i, cpu = smp_processor_id(); unsigned int gasid; /* - * Lazy host ASID regeneration / PT flush for guest user mode. - * If the guest ASID has changed since the last guest usermode - * execution, regenerate the host ASID so as to invalidate stale TLB - * entries and flush GVA PT entries too. + * No need to reload ASID, IRQs are disabled already so there's no rush, + * and we'll check if we need to regenerate below anyway before + * re-entering the guest. */ - if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + kvm_trap_emul_check_requests(vcpu, cpu, false); + + if (KVM_GUEST_KERNEL_MODE(vcpu)) { + mm = kern_mm; + } else { + mm = user_mm; + + /* + * Lazy host ASID regeneration / PT flush for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, invalidate the stale TLB entries and flush GVA PT + * entries too. + */ gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); - get_new_mmu_context(user_mm, cpu); for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, user_mm) = 0; + cpu_context(i, user_mm) = 0; vcpu->arch.last_user_gasid = gasid; } } + + /* + * Check if ASID is stale. This may happen due to a TLB flush request or + * a lazy user MM invalidation. + */ + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); } static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 91737ea205856c41183c2530fdb6b407ceeb3836 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 2 Dec 2016 23:40:52 +0000 Subject: KVM: MIPS/T&E: Reduce stale ASID checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stale ASID checks taking place on VCPU load can be reduced: - Now that we check for a stale ASID on guest re-entry, there is no need to do so when loading the VCPU outside of guest context, since it will happen before entering the guest. Note that a lot of KVM VCPU ioctls will cause the VCPU to be loaded but guest context won't be entered. - There is no need to check for a stale kernel_mm ASID when the guest is in user mode and vice versa. In fact doing so can potentially be problematic since the user_mm ASID regeneration may trigger a new ASID cycle, which would cause the kern_mm ASID to become stale after it has been checked for staleness. Therefore only check the ASID for the mm corresponding to the current guest mode, and only if we're already in guest context. We drop some of the related kvm_debug() calls here too. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 2b20b7de493e..edda8f039026 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -714,35 +714,15 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; struct mm_struct *mm; - /* Allocate new kernel and user ASIDs if needed */ - - if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - get_new_mmu_context(kern_mm, cpu); - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#lx\n", - cpu, cpu_context(cpu, kern_mm)); - } - - if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - get_new_mmu_context(user_mm, cpu); - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#lx\n", - cpu, cpu_context(cpu, user_mm)); - } - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) + * Were we in guest context? If so, restore the appropriate ASID based + * on the mode of the Guest (Kernel/User). */ if (current->flags & PF_VCPU) { mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm; + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); write_c0_entryhi(cpu_asid(cpu, mm)); TLBMISS_HANDLER_SETUP_PGD(mm->pgd); kvm_mips_suspend_mm(cpu); @@ -759,11 +739,8 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) if (current->flags & PF_VCPU) { /* Restore normal Linux process memory map */ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); + asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); - } write_c0_entryhi(cpu_asid(cpu, current->mm)); TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); -- cgit v1.2.3-70-g09d2 From 1880afd6057f34586919715e8ffe9c5858f4a326 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:04:52 +0000 Subject: KVM: MIPS/T&E: Add lockless GVA access helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add helpers to allow for lockless direct access to the GVA space, by changing the VCPU mode to READING_SHADOW_PAGE_TABLES for the duration of the access. This allows asynchronous TLB flush requests in future patches to safely trigger either a TLB flush before the direct GVA space access, or a delay until the in-progress lockless direct access is complete. The kvm_trap_emul_gva_lockless_begin() and kvm_trap_emul_gva_lockless_end() helpers take care of guarding the direct GVA accesses, and kvm_trap_emul_gva_fault() tries to handle a uaccess fault resulting from a flush having taken place. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 15 ++++++++++ arch/mips/kvm/mmu.c | 51 +++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 65 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a04657932e19..c1e46abb5704 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -243,6 +243,7 @@ enum emulation_result { #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) #define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) #define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) +#define TLB_IS_DIRTY(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ @@ -640,6 +641,20 @@ pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu); +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu); + +enum kvm_mips_fault_result { + KVM_MIPS_MAPPED = 0, + KVM_MIPS_GVA, + KVM_MIPS_GPA, + KVM_MIPS_TLB, + KVM_MIPS_TLBINV, + KVM_MIPS_TLBMOD, +}; +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write); /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index e41ee36dd626..32c317de6c0a 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -732,6 +732,57 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +/** + * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault. + * @vcpu: Virtual CPU. + * @gva: Guest virtual address to be accessed. + * @write: True if write attempted (must be dirtied and made writable). + * + * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and + * dirtying the page if @write so that guest instructions can be modified. + * + * Returns: KVM_MIPS_MAPPED on success. + * KVM_MIPS_GVA if bad guest virtual address. + * KVM_MIPS_GPA if bad guest physical address. + * KVM_MIPS_TLB if guest TLB not present. + * KVM_MIPS_TLBINV if guest TLB present but not valid. + * KVM_MIPS_TLBMOD if guest TLB read only. + */ +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + struct kvm_mips_tlb *tlb; + int index; + + if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu) < 0) + return KVM_MIPS_GPA; + } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) || + KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) { + /* Address should be in the guest TLB */ + index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) | + (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID)); + if (index < 0) + return KVM_MIPS_TLB; + tlb = &vcpu->arch.guest_tlb[index]; + + /* Entry should be valid, and dirty for writes */ + if (!TLB_IS_VALID(*tlb, gva)) + return KVM_MIPS_TLBINV; + if (write && !TLB_IS_DIRTY(*tlb, gva)) + return KVM_MIPS_TLBMOD; + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva)) + return KVM_MIPS_GPA; + } else { + return KVM_MIPS_GVA; + } + + return KVM_MIPS_MAPPED; +} + int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index edda8f039026..e20369d45f24 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -786,6 +786,71 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, } } +/** + * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Call before a GVA space access outside of guest mode, to ensure that + * asynchronous TLB flush requests are handled or delayed until completion of + * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()). + * + * Should be called with IRQs already enabled. + */ +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu) +{ + /* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */ + WARN_ON_ONCE(irqs_disabled()); + + /* + * The caller is about to access the GVA space, so we set the mode to + * force TLB flush requests to send an IPI, and also disable IRQs to + * delay IPI handling until kvm_trap_emul_gva_lockless_end(). + */ + local_irq_disable(); + + /* + * Make sure the read of VCPU requests is not reordered ahead of the + * write to vcpu->mode, or we could miss a TLB flush request while + * the requester sees the VCPU as outside of guest mode and not needing + * an IPI. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + + /* + * If a TLB flush has been requested (potentially while + * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it + * before accessing the GVA space, and be sure to reload the ASID if + * necessary as it'll be immediately used. + * + * TLB flush requests after this check will trigger an IPI due to the + * mode change above, which will be delayed due to IRQs disabled. + */ + kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true); +} + +/** + * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Called after a GVA space access outside of guest mode. Should have a matching + * call to kvm_trap_emul_gva_lockless_begin(). + */ +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu) +{ + /* + * Make sure the write to vcpu->mode is not reordered in front of GVA + * accesses, or a TLB flush requester may not think it necessary to send + * an IPI. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + + /* + * Now that the access to GVA space is complete, its safe for pending + * TLB flush request IPIs to be handled (which indicates completion). + */ + local_irq_enable(); +} + static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) { -- cgit v1.2.3-70-g09d2 From 4b21e8abf959ca66c27f0656bf294fe69d3f2254 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:13:38 +0000 Subject: KVM: MIPS/T&E: Use lockless GVA helpers for dyntrans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the dynamic translation of guest instructions. This will allow it to handle asynchronous TLB flushes when they are implemented. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/dyntrans.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 60ebf5862d2b..f8e772564d74 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -33,10 +33,32 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, unsigned long vaddr = (unsigned long)opc; int err; +retry: + /* The GVA page table is still active so use the Linux TLB handlers */ + kvm_trap_emul_gva_lockless_begin(vcpu); err = put_user(replace.word, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + if (unlikely(err)) { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return err; + /* + * We write protect clean pages in GVA page table so normal + * Linux TLB mod handler doesn't silently dirty the page. + * Its also possible we raced with a GVA invalidation. + * Try to force the page to become dirty. + */ + err = kvm_trap_emul_gva_fault(vcpu, vaddr, true); + if (unlikely(err)) { + kvm_info("%s: Address unwriteable: %p\n", + __func__, opc); + return -EFAULT; + } + + /* + * Try again. This will likely trigger a TLB refill, which will + * fetch the new dirty entry from the GVA page table, which + * should then succeed. + */ + goto retry; } __local_flush_icache_user_range(vaddr, vaddr + 4); -- cgit v1.2.3-70-g09d2 From 5207ce144a25aef89dd12b8fc3ccaa53aba4f2bd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:15:53 +0000 Subject: KVM: MIPS/MMU: Use lockless GVA helpers for get_inst() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the reading of guest instructions for emulation. This will allow it to handle asynchronous TLB flushes when they are implemented. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 32c317de6c0a..b3da473e1569 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -787,11 +787,26 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; +retry: + kvm_trap_emul_gva_lockless_begin(vcpu); err = get_user(*out, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + if (unlikely(err)) { - kvm_err("%s: illegal address: %p\n", __func__, opc); - return -EFAULT; - } + /* + * Try to handle the fault, maybe we just raced with a GVA + * invalidation. + */ + err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc, + false); + if (unlikely(err)) { + kvm_err("%s: illegal address: %p\n", + __func__, opc); + return -EFAULT; + } + /* Hopefully it'll work now */ + goto retry; + } return 0; } -- cgit v1.2.3-70-g09d2 From 4cf74c9c83dda79143490d7cc774b7830e257fcd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 26 Nov 2016 00:37:28 +0000 Subject: KVM: MIPS/Emulate: Use lockless GVA helpers for cache emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the reading of guest instructions for emulation. This will allow it to handle asynchronous TLB flushes when they are implemented. This is a little more complicated than the other two cases (get_inst() and dynamic translation) due to the need to emulate the appropriate guest TLB exception when the address isn't present or isn't valid in the guest TLB. Since there are several protected cache ops that may need to be performed safely, this is abstracted by kvm_mips_guest_cache_op() which is passed a protected cache op function pointer and takes care of the lockless operation and fault handling / retry if the op should fail, taking advantage of the new errors which the protected cache ops can now return. This allows the existing advance fault handling which relied on host TLB lookups to be removed, along with the now unused kvm_mips_host_tlb_lookup(), Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/emulate.c | 148 +++++++++++++++++---------------------- arch/mips/kvm/tlb.c | 35 --------- 3 files changed, 66 insertions(+), 119 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index c1e46abb5704..33d3d8ac742e 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -226,6 +226,7 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_WAIT, /* WAIT instruction */ EMULATE_PRIV_FAIL, + EMULATE_EXCEPT, /* A guest exception has been generated */ }; #define mips3_paddr_to_tlbpfn(x) \ @@ -614,7 +615,6 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); -extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b295a4a1496f..40159cf5166b 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1697,12 +1697,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, return er; } +static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), + unsigned long curr_pc, + unsigned long addr, + struct kvm_run *run, + struct kvm_vcpu *vcpu, + u32 cause) +{ + int err; + + for (;;) { + /* Carefully attempt the cache operation */ + kvm_trap_emul_gva_lockless_begin(vcpu); + err = fn(addr); + kvm_trap_emul_gva_lockless_end(vcpu); + + if (likely(!err)) + return EMULATE_DONE; + + /* + * Try to handle the fault and retry, maybe we just raced with a + * GVA invalidation. + */ + switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) { + case KVM_MIPS_GVA: + case KVM_MIPS_GPA: + /* bad virtual or physical address */ + return EMULATE_FAIL; + case KVM_MIPS_TLB: + /* no matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + case KVM_MIPS_TLBINV: + /* invalid matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + default: + break; + }; + } +} + enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; u32 cache, op_inst, op, base; s16 offset; @@ -1759,81 +1803,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, goto done; } - preempt_disable(); - if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 && - kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) { - kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n", - __func__, va, vcpu, read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - int index; - - /* If an entry already exists then skip */ - if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0) - goto skip_fault; - - /* - * If address not in the guest TLB, then give the guest a fault, - * the resulting handler will do the right thing - */ - index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) | - (kvm_read_c0_guest_entryhi - (cop0) & KVM_ENTRYHI_ASID)); - - if (index < 0) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, - vcpu); - preempt_enable(); - goto dont_update_pc; - } else { - struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index]; - /* - * Check if the entry is valid, if not then setup a TLB - * invalid exception to the guest - */ - if (!TLB_IS_VALID(*tlb, va)) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbinv_ld(cause, NULL, - run, vcpu); - preempt_enable(); - goto dont_update_pc; - } - /* - * We fault an entry from the guest tlb to the - * shadow host TLB - */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - va)) { - kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, va, index, vcpu, - read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } - } else { - kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", - cache, op, base, arch->gprs[base], offset); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - - } - -skip_fault: /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { - protected_writeback_dcache_line(va); - + /* + * Perform the dcache part of icache synchronisation on the + * guest's behalf. + */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* * Replace the CACHE instruction, with a SYNCI, not the same, @@ -1842,8 +1821,15 @@ skip_fault: kvm_mips_trans_cache_va(inst, opc, vcpu); #endif } else if (op_inst == Hit_Invalidate_I) { - protected_writeback_dcache_line(va); - protected_flush_icache_line(va); + /* Perform the icache synchronisation on the guest's behalf */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; + er = kvm_mips_guest_cache_op(protected_flush_icache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* Replace the CACHE instruction, with a SYNCI */ @@ -1855,17 +1841,13 @@ skip_fault: er = EMULATE_FAIL; } - preempt_enable(); done: /* Rollback PC only if emulation was unsuccessful */ if (er == EMULATE_FAIL) vcpu->arch.pc = curr_pc; - -dont_update_pc: - /* - * This is for exceptions whose emulation updates the PC, so do not - * overwrite the PC under any circumstances - */ + /* Guest exception needs guest to resume */ + if (er == EMULATE_EXCEPT) + er = EMULATE_DONE; return er; } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index cee2e9feb942..2819eb793345 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -117,41 +117,6 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) } EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup); -int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) -{ - unsigned long old_entryhi, flags; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_kernel_asid(vcpu)); - else { - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_user_asid(vcpu)); - } - - mtc0_tlbw_hazard(); - - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); - - kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx); - - return idx; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup); - static int _kvm_mips_host_tlb_inv(unsigned long entryhi) { int idx; -- cgit v1.2.3-70-g09d2 From b6209110863363b55dd60fe28a993e5367d4a215 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 25 Oct 2016 00:01:37 +0100 Subject: KVM: MIPS: Implement kvm_arch_flush_shadow_all/memslot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the kvm_arch_flush_shadow_all() and kvm_arch_flush_shadow_memslot() KVM functions for MIPS to allow guest physical mappings to be safely changed. The general MIPS KVM code takes care of flushing of GPA page table entries. kvm_arch_flush_shadow_all() flushes the whole GPA page table, and is always called on the cleanup path so there is no need to acquire the kvm->mmu_lock. kvm_arch_flush_shadow_memslot() flushes only the range of mappings in the GPA page table corresponding to the slot being flushed, and happens when memory regions are moved or deleted. MIPS KVM implementation callbacks are added for handling the implementation specific flushing of mappings derived from the GPA page tables. These are implemented for trap_emul.c using kvm_flush_remote_tlbs() which should now be functional, and will flush the per-VCPU GVA page tables and ASIDS synchronously (before next entering guest mode or directly accessing GVA space). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 11 ++++++++--- arch/mips/kvm/mips.c | 26 ++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 14 ++++++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 33d3d8ac742e..ea1b495c042c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -531,6 +531,14 @@ struct kvm_mips_callbacks { int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*flush_shadow_all)(struct kvm *kvm); + /* + * Must take care of flushing any cached GPA PTEs (e.g. guest entries in + * VZ root TLB, or T&E GVA page tables and corresponding root TLB + * mappings). + */ + void (*flush_shadow_memslot)(struct kvm *kvm, + const struct kvm_memory_slot *slot); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); @@ -827,9 +835,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} -static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} -static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ff5e34293227..01f3fa1b9f0e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -157,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + /* Flush whole GPA */ + kvm_mips_flush_gpa_pt(kvm, 0, ~0); + + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + /* + * The slot has been made invalid (ready for moving or deletion), so we + * need to ensure that it can no longer be accessed by any guest VCPUs. + */ + + spin_lock(&kvm->mmu_lock); + /* Flush slot from GPA */ + kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, + slot->base_gfn + slot->npages - 1); + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + spin_unlock(&kvm->mmu_lock); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index e20369d45f24..1efe78d4bda8 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -586,6 +586,18 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm) +{ + /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */ + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + kvm_trap_emul_flush_shadow_all(kvm); +} + static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) { return 0; @@ -963,6 +975,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, + .flush_shadow_all = kvm_trap_emul_flush_shadow_all, + .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, -- cgit v1.2.3-70-g09d2 From 89d6ad8a6b26a51f6fdfd356a56681a11f309bba Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 14 Dec 2016 01:58:44 +0000 Subject: KVM: MIPS/T&E: Ignore user writes to CP0_Config7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ignore userland writes to CP0_Config7 rather than reporting an error, since we do allow reads of this register and it is claimed to exist in the ioctl API. This allows userland to blindly save and restore KVM registers without having to special case certain registers as not being writable, for example during live migration once dirty page logging is fixed. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 1efe78d4bda8..d0b6409cb4a3 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -705,6 +705,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, kvm_write_c0_guest_config5(cop0, v); } break; + case KVM_REG_MIPS_CP0_CONFIG7: + /* writes ignored */ + break; case KVM_REG_MIPS_COUNT_CTL: ret = kvm_mips_set_count_ctl(vcpu, v); break; -- cgit v1.2.3-70-g09d2 From 577ed7f71e9c37a46c45a7bd9a392dd0372a409c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 1 May 2015 14:56:31 +0100 Subject: KVM: MIPS: Pass type of fault down to kvm_mips_map_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_map_page() will need to know whether the fault was due to a read or a write in order to support dirty page tracking, KVM_CAP_SYNC_MMU, and read only memory regions, so get that information passed down to it via new bool write_fault arguments to various functions. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 9 ++++++--- arch/mips/kvm/emulate.c | 7 ++++--- arch/mips/kvm/mmu.c | 21 +++++++++++++-------- arch/mips/kvm/trap_emul.c | 4 ++-- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index ea1b495c042c..17db9d6c7c93 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -597,19 +597,22 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, - unsigned long gva); + unsigned long gva, + bool write_fault); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 40159cf5166b..4875afca3f26 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2704,7 +2704,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { enum emulation_result er = EMULATE_DONE; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; @@ -2760,8 +2761,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - va)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va, + write_fault)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index b3da473e1569..1af65f2e6bb7 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -308,6 +308,7 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. * @gpa: Guest physical address of fault. + * @write_fault: Whether the fault was due to a write. * @out_entry: New PTE for @gpa (written on success unless NULL). * @out_buddy: New PTE for @gpa's buddy (written on success unless * NULL). @@ -327,6 +328,7 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) * as an MMIO access. */ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, + bool write_fault, pte_t *out_entry, pte_t *out_buddy) { struct kvm *kvm = vcpu->kvm; @@ -558,7 +560,8 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { unsigned long gpa; kvm_pfn_t pfn0, pfn1; @@ -576,10 +579,11 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); vaddr = badvaddr & (PAGE_MASK << 1); - if (kvm_mips_map_page(vcpu, gpa, &pte_gpa[0], NULL) < 0) + if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[0], NULL) < 0) return -1; - if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, &pte_gpa[1], NULL) < 0) + if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, write_fault, &pte_gpa[1], + NULL) < 0) return -1; pfn0 = pte_pfn(pte_gpa[0]); @@ -604,7 +608,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, - unsigned long gva) + unsigned long gva, + bool write_fault) { kvm_pfn_t pfn; long tlb_lo = 0; @@ -621,8 +626,8 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, tlb_lo = tlb->tlb_lo[idx]; /* Find host PFN */ - if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), &pte_gpa, - NULL) < 0) + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), write_fault, + &pte_gpa, NULL) < 0) return -1; pfn = pte_pfn(pte_gpa); @@ -757,7 +762,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, int index; if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) { - if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu) < 0) + if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0) return KVM_MIPS_GPA; } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) || KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) { @@ -774,7 +779,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, if (write && !TLB_IS_DIRTY(*tlb, gva)) return KVM_MIPS_TLBMOD; - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva)) + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write)) return KVM_MIPS_GPA; } else { return KVM_MIPS_GVA; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index d0b6409cb4a3..070d1ddbc7ee 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -159,7 +159,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * into the shadow host TLB */ - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); + er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { @@ -172,7 +172,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * not expect to ever get them */ if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { + (vcpu->arch.host_cp0_badvaddr, vcpu, store) < 0) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } -- cgit v1.2.3-70-g09d2 From 420ea09b645b0fb05b326a539190c41ee900ef50 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 19:27:18 +0000 Subject: KVM: MIPS/T&E: Abstract bad access handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract the handling of bad guest loads and stores which may need to trigger an MMIO, so that the same code can be used in a later patch for guest KSeg0 addresses (TLB exception handling) as well as for host KSeg1 addresses (existing address error exception and TLB exception handling). We now use kvm_mips_emulate_store() and kvm_mips_emulate_load() directly rather than the more generic kvm_mips_emulate_inst(), as there is no need to expose emulation of any other instructions. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 119 ++++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 070d1ddbc7ee..ae971ae30e30 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -85,6 +85,75 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) return ret; } +static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Fetch the instruction. */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the load */ + er = kvm_mips_emulate_load(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate load from MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* Fetch the instruction. */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the store */ + er = kvm_mips_emulate_store(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate store to MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu, bool store) +{ + if (store) + return kvm_mips_bad_store(cause, opc, run, vcpu); + else + return kvm_mips_bad_load(cause, opc, run, vcpu); +} + static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; @@ -178,28 +247,11 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) } } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - /* A code fetch fault doesn't count as an MMIO */ - if (!store && kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return RESUME_HOST; - } - /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. */ - kvm_debug("Emulate %s MMIO space\n", - store ? "Store to" : "Load from"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate %s MMIO space failed\n", - store ? "Store to" : "Load from"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); @@ -227,21 +279,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - kvm_debug("Emulate Store to MMIO space\n"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Store to MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_store(cause, opc, run, vcpu); } else { kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); @@ -257,32 +299,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { - /* A code fetch fault doesn't count as an MMIO */ - if (kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return RESUME_HOST; - } - - kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Load from MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_load(cause, opc, run, vcpu); } else { kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; - er = EMULATE_FAIL; } return ret; } -- cgit v1.2.3-70-g09d2 From b8f79ddb7db95bb675b3d6009e7a4274161e1e53 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 11 May 2015 23:31:45 +0100 Subject: KVM: MIPS/T&E: Treat unhandled guest KSeg0 as MMIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat unhandled accesses to guest KSeg0 as MMIO, rather than only host KSeg0 addresses. This will allow read only memory regions (such as the Malta boot flash as emulated by QEMU) to have writes (before reads) treated as MMIO, and unallocated physical addresses to have all accesses treated as MMIO. The MMIO emulation uses the gva_to_gpa callback, so this is also updated for trap & emulate to handle guest KSeg0 addresses. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 1 - arch/mips/kvm/trap_emul.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 1af65f2e6bb7..934bcc3732da 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -350,7 +350,6 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ae971ae30e30..77e059068a29 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -23,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) { gpa_t gpa; gva_t kseg = KSEGX(gva); + gva_t gkseg = KVM_GUEST_KSEGX(gva); if ((kseg == CKSEG0) || (kseg == CKSEG1)) gpa = CPHYSADDR(gva); + else if (gkseg == KVM_GUEST_KSEG0) + gpa = KVM_GUEST_CPHYSADDR(gva); else { kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva); kvm_mips_dump_host_tlbs(); @@ -240,11 +243,8 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * All KSEG0 faults are handled by KVM, as the guest kernel does * not expect to ever get them */ - if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu, store) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0) + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { /* -- cgit v1.2.3-70-g09d2 From 64ebc9e24074403c4127b06c0203f3e7b3367e69 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 13:02:36 +0000 Subject: KVM: MIPS/T&E: Handle read only GPA in TLB mod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite TLB modified exception handling to handle read only GPA memory regions, instead of unconditionally passing the exception to the guest. If the guest TLB is not the cause of the exception we call into the normal TLB fault handling depending on the memory segment, which will soon attempt to remap the physical page to be writable (handling dirty page tracking or copy on write in the process). Failing that we fall back to treating it as MMIO, due to a read only memory region. Once the capability is enabled, this will allow read only memory regions (such as the Malta boot flash as emulated by QEMU) to have writes treated as MMIO, while still allowing reads to run untrapped. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 5 --- arch/mips/kvm/emulate.c | 31 ------------------ arch/mips/kvm/trap_emul.c | 69 +++++++++++++++++++++++++--------------- 3 files changed, 43 insertions(+), 62 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 17db9d6c7c93..13c9e128bb86 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -614,11 +614,6 @@ extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, struct kvm_vcpu *vcpu, bool write_fault); -extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, - u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu); - extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 4875afca3f26..b6cafb0a9df4 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2124,37 +2124,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, return EMULATE_DONE; } -/* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu) -{ - enum emulation_result er = EMULATE_DONE; -#ifdef DEBUG - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | - (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); - bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); - int index; - - /* If address not in the guest TLB, then we are in trouble */ - index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (index < 0) { - /* XXXKYMA Invalidate and retry */ - kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr, - !kernel, kernel); - kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n", - __func__, entryhi); - kvm_mips_dump_guest_tlbs(vcpu); - kvm_mips_dump_host_tlbs(); - return EMULATE_FAIL; - } -#endif - - er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); - return er; -} - enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 77e059068a29..001c5fb61049 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -159,46 +159,63 @@ static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { + struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; - int ret = RESUME_GUEST; + struct kvm_mips_tlb *tlb; + unsigned long entryhi; + int index; if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); + /* + * First find the mapping in the guest TLB. If the failure to + * write was due to the guest TLB, it should be up to the guest + * to handle it. + */ + entryhi = (badvaddr & VPN2_MASK) | + (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); + index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (er == EMULATE_DONE) - ret = RESUME_GUEST; - else { + /* + * These should never happen. + * They would indicate stale host TLB entries. + */ + if (unlikely(index < 0)) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + return RESUME_HOST; } - } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + tlb = vcpu->arch.guest_tlb + index; + if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + /* - * XXXKYMA: The guest kernel does not expect to get this fault - * when we are not using HIGHMEM. Need to address this in a - * HIGHMEM kernel + * Guest entry not dirty? That would explain the TLB modified + * exception. Relay that on to the guest so it can handle it. */ - kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + if (!TLB_IS_DIRTY(*tlb, badvaddr)) { + kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); + return RESUME_GUEST; + } + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, + true)) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; + } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; } else { - kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + /* host kernel addresses are all handled as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); } - return ret; } static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) -- cgit v1.2.3-70-g09d2 From f0c0c330f7bb1a640968798b63c0dffc6a8af0ec Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:47:47 +0000 Subject: KVM: MIPS/MMU: Add GPA PT mkclean helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function to make a range of guest physical address (GPA) mappings in the GPA page table clean so that writes can be caught. This will be used in a few places to manage dirty page logging. Note that until the dirty bit is transferred from GPA page table entries to GVA page table entries in an upcoming patch this won't trigger a TLB modified exception on write. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mmu.c | 124 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 13c9e128bb86..a7394940119c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -643,6 +643,7 @@ enum kvm_mips_flush { }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 934bcc3732da..892fd0ede718 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -304,6 +304,130 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) end_gfn << PAGE_SHIFT); } +#define BUILD_PTE_RANGE_OP(name, op) \ +static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + int i_min = __pte_offset(start); \ + int i_max = __pte_offset(end); \ + int i; \ + pte_t old, new; \ + \ + for (i = i_min; i <= i_max; ++i) { \ + if (!pte_present(pte[i])) \ + continue; \ + \ + old = pte[i]; \ + new = op(old); \ + if (pte_val(new) == pte_val(old)) \ + continue; \ + set_pte(pte + i, new); \ + ret = 1; \ + } \ + return ret; \ +} \ + \ +/* returns true if anything was done */ \ +static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pte_t *pte; \ + unsigned long cur_end = ~0ul; \ + int i_min = __pmd_offset(start); \ + int i_max = __pmd_offset(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pmd_present(pmd[i])) \ + continue; \ + \ + pte = pte_offset(pmd + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pte(pte, start, cur_end); \ + } \ + return ret; \ +} \ + \ +static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pmd_t *pmd; \ + unsigned long cur_end = ~0ul; \ + int i_min = __pud_offset(start); \ + int i_max = __pud_offset(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pud_present(pud[i])) \ + continue; \ + \ + pmd = pmd_offset(pud + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pmd(pmd, start, cur_end); \ + } \ + return ret; \ +} \ + \ +static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pud_t *pud; \ + unsigned long cur_end = ~0ul; \ + int i_min = pgd_index(start); \ + int i_max = pgd_index(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pgd_present(pgd[i])) \ + continue; \ + \ + pud = pud_offset(pgd + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pud(pud, start, cur_end); \ + } \ + return ret; \ +} + +/* + * kvm_mips_mkclean_gpa_pt. + * Mark a range of guest physical address space clean (writes fault) in the VM's + * GPA page table to allow dirty page tracking. + */ + +BUILD_PTE_RANGE_OP(mkclean, pte_mkclean) + +/** + * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Make a range of GPA mappings clean so that guest writes will fault and + * trigger dirty page logging. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether any GPA mappings were modified, which would require + * derived mappings (GVA page tables & TLB enties) to be + * invalidated. + */ +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. -- cgit v1.2.3-70-g09d2 From e88643ba1acb48fa30345ba75cc324d7181aa2bf Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:50:52 +0000 Subject: KVM: MIPS/MMU: Use generic dirty log & protect helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS hasn't up to this point properly supported dirty page logging, as pages in slots with dirty logging enabled aren't made clean, and tlbmod exceptions from writes to clean pages have been assumed to be due to guest TLB protection and unconditionally passed to the guest. Use the generic dirty logging helper kvm_get_dirty_log_protect() to properly implement kvm_vm_ioctl_get_dirty_log(), similar to how ARM does. This uses xchg to clear the dirty bits when reading them, rather than wiping them out afterwards with a memset, which would potentially wipe recently set bits that weren't caught by kvm_get_dirty_log(). It also makes the pages clean again using the kvm_arch_mmu_enable_log_dirty_pt_masked() architecture callback so that further writes after the shadow memslot is flushed will trigger tlbmod exceptions and dirty handling. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c | 42 +++++++++++++++++++++++------------------- arch/mips/kvm/mmu.c | 22 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 7c56d6b124d1..85c4593b634a 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO select SRCU ---help--- diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 01f3fa1b9f0e..0b84b336ee4d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1086,42 +1086,46 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return r; } -/* Get (and clear) the dirty memory log for a memory slot. */ +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - unsigned long ga, ga_end; - int is_dirty = 0; + bool is_dirty = false; int r; - unsigned long n; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); - /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga, - ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); } - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; - } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 892fd0ede718..63a6d542ecb3 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -428,6 +428,28 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) end_gfn << PAGE_SHIFT); } +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire @kvm->mmu_lock. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + gfn_t base_gfn = slot->base_gfn + gfn_offset; + gfn_t start = base_gfn + __ffs(mask); + gfn_t end = base_gfn + __fls(mask); + + kvm_mips_mkclean_gpa_pt(kvm, start, end); +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. -- cgit v1.2.3-70-g09d2 From a1ac9e17b7c934666a780772866135b9fea17f4c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:56:20 +0000 Subject: KVM: MIPS: Clean & flush on dirty page logging enable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an existing memory region has dirty page logging enabled, make the entire slot clean (read only) so that writes will immediately start logging dirty pages (once the dirty bit is transferred from GPA to GVA page tables in an upcoming patch). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0b84b336ee4d..475c4cc78bd6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -197,9 +197,33 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + int needs_flush; + kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", __func__, kvm, mem->slot, mem->guest_phys_addr, mem->memory_size, mem->userspace_addr); + + /* + * If dirty page logging is enabled, write protect all pages in the slot + * ready for dirty logging. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if (change == KVM_MR_FLAGS_ONLY && + (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { + spin_lock(&kvm->mmu_lock); + /* Write protect GPA page table entries */ + needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, + new->base_gfn + new->npages - 1); + /* Let implementation do the rest */ + if (needs_flush) + kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + spin_unlock(&kvm->mmu_lock); + } } static inline void dump_handler(const char *symbol, void *start, void *end) -- cgit v1.2.3-70-g09d2 From b5f1dd1ba4042bda191cd2e72726c920e6c2867f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:57:10 +0000 Subject: KVM: MIPS/MMU: Handle dirty logging on GPA faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update kvm_mips_map_page() to handle logging of dirty guest physical pages. Upcoming patches will propagate the dirty bit to the GVA page tables. A fast path is added for handling protection bits that can be resolved without calling into KVM, currently just dirtying of clean pages being written to. The slow path marks the GPA page table entry writable only on writes, and at the same time marks the page dirty in the dirty page logging bitmask. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 63a6d542ecb3..7962eea4ebc3 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -450,6 +450,58 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mips_mkclean_gpa_pt(kvm, start, end); } +/** + * _kvm_mips_map_page_fast() - Fast path GPA fault handler. + * @vcpu: VCPU pointer. + * @gpa: Guest physical address of fault. + * @write_fault: Whether the fault was due to a write. + * @out_entry: New PTE for @gpa (written on success unless NULL). + * @out_buddy: New PTE for @gpa's buddy (written on success unless + * NULL). + * + * Perform fast path GPA fault handling, doing all that can be done without + * calling into KVM. This handles dirtying of clean pages (for dirty page + * logging). + * + * Returns: 0 on success, in which case we can update derived mappings and + * resume guest execution. + * -EFAULT on failure due to absent GPA mapping or write to + * read-only page, in which case KVM must be consulted. + */ +static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, + bool write_fault, + pte_t *out_entry, pte_t *out_buddy) +{ + struct kvm *kvm = vcpu->kvm; + gfn_t gfn = gpa >> PAGE_SHIFT; + pte_t *ptep; + int ret = 0; + + spin_lock(&kvm->mmu_lock); + + /* Fast path - just check GPA page table for an existing entry */ + ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + if (!ptep || !pte_present(*ptep)) { + ret = -EFAULT; + goto out; + } + + if (write_fault && !pte_dirty(*ptep)) { + /* Track dirtying of pages */ + set_pte(ptep, pte_mkdirty(*ptep)); + mark_page_dirty(kvm, gfn); + } + + if (out_entry) + *out_entry = *ptep; + if (out_buddy) + *out_buddy = *ptep_buddy(ptep); + +out: + spin_unlock(&kvm->mmu_lock); + return ret; +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. @@ -462,9 +514,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * Handle GPA faults by creating a new GPA mapping (or updating an existing * one). * - * This takes care of asking KVM for the corresponding PFN, and creating a - * mapping in the GPA page tables. Derived mappings (GVA page tables and TLBs) - * must be handled by the caller. + * This takes care of marking pages dirty (dirty page tracking), asking KVM for + * the corresponding PFN, and creating a mapping in the GPA page tables. Derived + * mappings (GVA page tables and TLBs) must be handled by the caller. * * Returns: 0 on success, in which case the caller may use the @out_entry * and @out_buddy PTEs to update derived mappings and resume guest @@ -485,7 +537,12 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, pte_t *ptep, entry, old_pte; unsigned long prot_bits; + /* Try the fast path to handle clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); + err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry, + out_buddy); + if (!err) + goto out; /* We need a minimum of cached pages ready for page table creation */ err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, @@ -493,6 +550,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, if (err) goto out; + /* Slow path - ask KVM core whether we can access this GPA */ pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { @@ -502,11 +560,19 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, spin_lock(&kvm->mmu_lock); + /* Ensure page tables are allocated */ ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); - prot_bits = __READABLE | _PAGE_PRESENT | __WRITEABLE; + /* Set up the PTE */ + prot_bits = __READABLE | _PAGE_PRESENT | _PAGE_WRITE | + _page_cachable_default; + if (write_fault) { + prot_bits |= __WRITEABLE; + mark_page_dirty(kvm, gfn); + } entry = pfn_pte(pfn, __pgprot(prot_bits)); + /* Write the PTE */ old_pte = *ptep; set_pte(ptep, entry); if (pte_present(old_pte)) -- cgit v1.2.3-70-g09d2 From b584f460e6d6f1bf968acfcd23aceb663ba996fa Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:59:28 +0000 Subject: KVM: MIPS/MMU: Pass GPA PTE bits to KSeg0 GVA PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate the GPA PTE protection bits on to the GVA PTEs on a KSeg0 fault (except _PAGE_WRITE), rather than always overriding the protection. This allows dirty page tracking to work in KSeg0 as a clear dirty bit in the GPA PTE will propagate to the GVA PTEs. This makes it simpler to use a single kvm_mips_map_page() to obtain both the main GPA PTE and its buddy (which may be invalid), which also allows memory regions to be fully accessible when they don't start and end on a 2*PAGE_SIZE boundary. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 7962eea4ebc3..9cc941864aa8 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -769,15 +769,27 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) } } +static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte) +{ + /* + * Don't leak writeable but clean entries from GPA page tables. We don't + * want the normal Linux tlbmod handler to handle dirtying when KVM + * accesses guest memory. + */ + if (!pte_dirty(pte)) + pte = pte_wrprotect(pte); + + return pte; +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, bool write_fault) { unsigned long gpa; - kvm_pfn_t pfn0, pfn1; - unsigned long vaddr; pte_t pte_gpa[2], *ptep_gva; + int idx; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -785,35 +797,26 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, return -1; } - /* Find host PFNs */ - - gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); - vaddr = badvaddr & (PAGE_MASK << 1); - - if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[0], NULL) < 0) - return -1; - - if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, write_fault, &pte_gpa[1], - NULL) < 0) + /* Get the GPA page table entry */ + gpa = KVM_GUEST_CPHYSADDR(badvaddr); + idx = (badvaddr >> PAGE_SHIFT) & 1; + if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx], + &pte_gpa[!idx]) < 0) return -1; - pfn0 = pte_pfn(pte_gpa[0]); - pfn1 = pte_pfn(pte_gpa[1]); - - /* Find GVA page table entry */ - - ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, vaddr); + /* Get the GVA page table entry */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE); if (!ptep_gva) { - kvm_err("No ptep for gva %lx\n", vaddr); + kvm_err("No ptep for gva %lx\n", badvaddr); return -1; } - /* Write host PFNs into GVA page table */ - ptep_gva[0] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn0, PAGE_SHARED))); - ptep_gva[1] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn1, PAGE_SHARED))); + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]); /* Invalidate this entry in the TLB, guest kernel ASID only */ - kvm_mips_host_tlb_inv(vcpu, vaddr, false, true); + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); return 0; } -- cgit v1.2.3-70-g09d2 From f9b11e51f89f6d2eca2ca8f41bb0ceb07c943e60 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:59:43 +0000 Subject: KVM: MIPS/MMU: Pass GPA PTE bits to mapped GVA PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate the GPA PTE protection bits on to the GVA PTEs on a mapped fault (except _PAGE_WRITE, and filtered by the guest TLB entry), rather than always overriding the protection. This allows dirty page tracking to work in mapped guest segments as a clear dirty bit in the GPA PTE will propagate to the GVA PTEs even when the guest TLB has the dirty bit set. Since the filtering of protection bits is now abstracted, if the buddy GVA PTE is also valid, we obtain the corresponding GPA PTE using a simple non-allocating walk and load that into the GVA PTE similarly (which may itself be invalid). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 56 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 9cc941864aa8..8a01bbd276fc 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -782,6 +782,15 @@ static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte) return pte; } +static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo) +{ + /* Guest EntryLo overrides host EntryLo */ + if (!(entrylo & ENTRYLO_D)) + pte = pte_mkclean(pte); + + return kvm_mips_gpa_pte_to_gva_unmapped(pte); +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, @@ -825,39 +834,48 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long gva, bool write_fault) { - kvm_pfn_t pfn; - long tlb_lo = 0; - pte_t pte_gpa, *ptep_gva; - unsigned int idx; + struct kvm *kvm = vcpu->kvm; + long tlb_lo[2]; + pte_t pte_gpa[2], *ptep_buddy, *ptep_gva; + unsigned int idx = TLB_LO_IDX(*tlb, gva); bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); + tlb_lo[0] = tlb->tlb_lo[0]; + tlb_lo[1] = tlb->tlb_lo[1]; + /* * The commpage address must not be mapped to anything else if the guest * TLB contains entries nearby, or commpage accesses will break. */ - idx = TLB_LO_IDX(*tlb, gva); - if ((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & PAGE_MASK) - tlb_lo = tlb->tlb_lo[idx]; + if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1))) + tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0; - /* Find host PFN */ - if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), write_fault, - &pte_gpa, NULL) < 0) + /* Get the GPA page table entry */ + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]), + write_fault, &pte_gpa[idx], NULL) < 0) return -1; - pfn = pte_pfn(pte_gpa); - /* Find GVA page table entry */ - ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); + /* And its GVA buddy's GPA page table entry if it also exists */ + pte_gpa[!idx] = pfn_pte(0, __pgprot(0)); + if (tlb_lo[!idx] & ENTRYLO_V) { + spin_lock(&kvm->mmu_lock); + ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL, + mips3_tlbpfn_to_paddr(tlb_lo[!idx])); + if (ptep_buddy) + pte_gpa[!idx] = *ptep_buddy; + spin_unlock(&kvm->mmu_lock); + } + + /* Get the GVA page table entry pair */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE); if (!ptep_gva) { kvm_err("No ptep for gva %lx\n", gva); return -1; } - /* Write PFN into GVA page table, taking attributes from Guest TLB */ - *ptep_gva = pfn_pte(pfn, (!(tlb_lo & ENTRYLO_V)) ? __pgprot(0) : - (tlb_lo & ENTRYLO_D) ? PAGE_SHARED : - PAGE_READONLY); - if (pte_present(*ptep_gva)) - *ptep_gva = pte_mkyoung(pte_mkdirty(*ptep_gva)); + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]); /* Invalidate this entry in the TLB, current guest mode ASID only */ kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel); -- cgit v1.2.3-70-g09d2 From 411740f5422a960c30a4285343d821b62daec34b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 16:32:39 +0000 Subject: KVM: MIPS/MMU: Implement KVM_CAP_SYNC_MMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the SYNC_MMU capability for KVM MIPS, allowing changes in the underlying user host virtual address (HVA) mappings to be promptly reflected in the corresponding guest physical address (GPA) mappings. This allows for several features to work with guest RAM which require mappings to be altered or protected, such as copy-on-write, KSM (Kernel Samepage Merging), idle page tracking, memory swapping, and guest memory ballooning. There are two main aspects of this change, described below. The KVM MMU notifier architecture callbacks are implemented so we can be notified of changes in the HVA mappings. These arrange for the guest physical address (GPA) page tables to be modified and possibly for derived mappings (GVA page tables and TLBs) to be flushed. - kvm_unmap_hva[_range]() - These deal with HVA mappings being removed, for example before a copy-on-write takes place, which requires the corresponding GPA page table mappings to be removed too. - kvm_set_spte_hva() - These update a GPA page table entry to match the new HVA entry, but must be careful to respect KVM specific configuration such as not dirtying a clean guest page which is dirty to the host, and write protecting writable pages in read only memslots (which will soon be supported). - kvm[_test]_age_hva() - These update GPA page table entries to be old (invalid) so that access can be tracked, making them young again. The GPA page fault handling (kvm_mips_map_page) is updated to use gfn_to_pfn_prot() (which may provide read-only pages), to handle asynchronous page table invalidation from MMU notifier callbacks, and to handle more cases in the fast path. - mmu_notifier_seq is used to detect asynchronous page table invalidations while we're holding a pfn from gfn_to_pfn_prot() outside of kvm->mmu_lock, retrying if invalidations have taken place, e.g. a COW or a KSM page merge. - The fast path (_kvm_mips_map_page_fast) now handles marking old pages as young / accessed, and disallowing dirtying of clean pages that aren't actually writable (e.g. shared pages that should COW, and read-only memory regions when they are enabled in a future patch). - Due to the use of MMU notifications we no longer need to keep the page references after we've updated the GPA page tables. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 13 +++ arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c | 1 + arch/mips/kvm/mmu.c | 235 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 233 insertions(+), 17 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a7394940119c..718dfffa17d5 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -663,6 +663,19 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, unsigned long gva, bool write); +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} + /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 85c4593b634a..65067327db12 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO + select MMU_NOTIFIER select SRCU ---help--- Support for hosting Guest kernels. diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 475c4cc78bd6..9932f53a1e5c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1217,6 +1217,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_SYNC_MMU: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 8a01bbd276fc..cb0faade311e 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -194,7 +194,6 @@ static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa, if (!pte_present(pte[i])) continue; - kvm_release_pfn_clean(pte_pfn(pte[i])); set_pte(pte + i, __pte(0)); } return safe_to_remove; @@ -450,6 +449,155 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mips_mkclean_gpa_pt(kvm, start, end); } +/* + * kvm_mips_mkold_gpa_pt. + * Mark a range of guest physical address space old (all accesses fault) in the + * VM's GPA page table to allow detection of commonly used pages. + */ + +BUILD_PTE_RANGE_OP(mkold, pte_mkold) + +static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, + gfn_t end_gfn) +{ + return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, gfn_t gfn, + gpa_t gfn_end, + struct kvm_memory_slot *memslot, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, gfn, gfn_end, memslot, data); + } + + return ret; +} + + +static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + return 1; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + unsigned long end = hva + PAGE_SIZE; + + handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t hva_pte = *(pte_t *)data; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + pte_t old_pte; + + if (!gpa_pte) + return 0; + + /* Mapping may need adjusting depending on memslot flags */ + old_pte = *gpa_pte; + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + hva_pte = pte_mkclean(hva_pte); + else if (memslot->flags & KVM_MEM_READONLY) + hva_pte = pte_wrprotect(hva_pte); + + set_pte(gpa_pte, hva_pte); + + /* Replacing an absent or old page doesn't need flushes */ + if (!pte_present(old_pte) || !pte_young(old_pte)) + return 0; + + /* Pages swapped, aged, moved, or cleaned require flushes */ + return !pte_present(hva_pte) || + !pte_young(hva_pte) || + pte_pfn(old_pte) != pte_pfn(hva_pte) || + (pte_dirty(old_pte) && !pte_dirty(hva_pte)); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + int ret; + + ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); + if (ret) + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + + if (!gpa_pte) + return 0; + return pte_young(*gpa_pte); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); +} + /** * _kvm_mips_map_page_fast() - Fast path GPA fault handler. * @vcpu: VCPU pointer. @@ -460,8 +608,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * NULL). * * Perform fast path GPA fault handling, doing all that can be done without - * calling into KVM. This handles dirtying of clean pages (for dirty page - * logging). + * calling into KVM. This handles marking old pages young (for idle page + * tracking), and dirtying of clean pages (for dirty page logging). * * Returns: 0 on success, in which case we can update derived mappings and * resume guest execution. @@ -475,6 +623,8 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm *kvm = vcpu->kvm; gfn_t gfn = gpa >> PAGE_SHIFT; pte_t *ptep; + kvm_pfn_t pfn = 0; /* silence bogus GCC warning */ + bool pfn_valid = false; int ret = 0; spin_lock(&kvm->mmu_lock); @@ -486,10 +636,24 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; } + /* Track access to pages marked old */ + if (!pte_young(*ptep)) { + set_pte(ptep, pte_mkyoung(*ptep)); + pfn = pte_pfn(*ptep); + pfn_valid = true; + /* call kvm_set_pfn_accessed() after unlock */ + } if (write_fault && !pte_dirty(*ptep)) { - /* Track dirtying of pages */ + if (!pte_write(*ptep)) { + ret = -EFAULT; + goto out; + } + + /* Track dirtying of writeable pages */ set_pte(ptep, pte_mkdirty(*ptep)); + pfn = pte_pfn(*ptep); mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); } if (out_entry) @@ -499,6 +663,8 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, out: spin_unlock(&kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); return ret; } @@ -514,9 +680,10 @@ out: * Handle GPA faults by creating a new GPA mapping (or updating an existing * one). * - * This takes care of marking pages dirty (dirty page tracking), asking KVM for - * the corresponding PFN, and creating a mapping in the GPA page tables. Derived - * mappings (GVA page tables and TLBs) must be handled by the caller. + * This takes care of marking pages young or dirty (idle/dirty page tracking), + * asking KVM for the corresponding PFN, and creating a mapping in the GPA page + * tables. Derived mappings (GVA page tables and TLBs) must be handled by the + * caller. * * Returns: 0 on success, in which case the caller may use the @out_entry * and @out_buddy PTEs to update derived mappings and resume guest @@ -535,9 +702,11 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, int srcu_idx, err; kvm_pfn_t pfn; pte_t *ptep, entry, old_pte; + bool writeable; unsigned long prot_bits; + unsigned long mmu_seq; - /* Try the fast path to handle clean pages */ + /* Try the fast path to handle old / clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry, out_buddy); @@ -550,33 +719,63 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, if (err) goto out; - /* Slow path - ask KVM core whether we can access this GPA */ - pfn = gfn_to_pfn(kvm, gfn); +retry: + /* + * Used to check for invalidations in progress, of the pfn that is + * returned by pfn_to_pfn_prot below. + */ + mmu_seq = kvm->mmu_notifier_seq; + /* + * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in + * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * risk the page we get a reference to getting unmapped before we have a + * chance to grab the mmu_lock without mmu_notifier_retry() noticing. + * + * This smp_rmb() pairs with the effective smp_wmb() of the combination + * of the pte_unmap_unlock() after the PTE is zapped, and the + * spin_lock() in kvm_mmu_notifier_invalidate_() before + * mmu_notifier_seq is incremented. + */ + smp_rmb(); + /* Slow path - ask KVM core whether we can access this GPA */ + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable); if (is_error_noslot_pfn(pfn)) { err = -EFAULT; goto out; } spin_lock(&kvm->mmu_lock); + /* Check if an invalidation has taken place since we got pfn */ + if (mmu_notifier_retry(kvm, mmu_seq)) { + /* + * This can happen when mappings are changed asynchronously, but + * also synchronously if a COW is triggered by + * gfn_to_pfn_prot(). + */ + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + goto retry; + } /* Ensure page tables are allocated */ ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); /* Set up the PTE */ - prot_bits = __READABLE | _PAGE_PRESENT | _PAGE_WRITE | - _page_cachable_default; - if (write_fault) { - prot_bits |= __WRITEABLE; - mark_page_dirty(kvm, gfn); + prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default; + if (writeable) { + prot_bits |= _PAGE_WRITE; + if (write_fault) { + prot_bits |= __WRITEABLE; + mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); + } } entry = pfn_pte(pfn, __pgprot(prot_bits)); /* Write the PTE */ old_pte = *ptep; set_pte(ptep, entry); - if (pte_present(old_pte)) - kvm_release_pfn_clean(pte_pfn(old_pte)); err = 0; if (out_entry) @@ -585,6 +784,8 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, *out_buddy = *ptep_buddy(ptep); spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + kvm_set_pfn_accessed(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; -- cgit v1.2.3-70-g09d2 From 230c57244c2c4d945dba7f9d15845bffe4135b58 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 May 2015 17:11:49 +0100 Subject: KVM: MIPS: Claim KVM_CAP_READONLY_MEM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that load/store faults due to read only memory regions are treated as MMIO accesses it is safe to claim support for read only memory regions (KVM_CAP_READONLY_MEM). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/uapi/asm/kvm.h | 2 ++ arch/mips/kvm/mips.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 6985eb59b085..a8a0199bf760 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -19,6 +19,8 @@ * Some parts derived from the x86 version of this file. */ +#define __KVM_HAVE_READONLY_MEM + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9932f53a1e5c..591426cda15e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1217,6 +1217,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: r = 1; break; -- cgit v1.2.3-70-g09d2 From 654229a02456a9af372defb13d1911345360074d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Dec 2016 22:46:41 +0000 Subject: KVM: MIPS/T&E: Move CP0 register access into T&E MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Access to various CP0 registers via the KVM register access API needs to be implementation specific to allow restrictions to be made on changes, for example when VZ guest registers aren't present, so move them all into trap_emul.c in preparation for VZ. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/emulate.c | 2 +- arch/mips/kvm/mips.c | 198 --------------------------------------- arch/mips/kvm/trap_emul.c | 181 ++++++++++++++++++++++++++++++++++- 4 files changed, 179 insertions(+), 203 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 718dfffa17d5..bc56a312497d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -345,7 +345,6 @@ struct kvm_vcpu_arch { u8 fpu_enabled; u8 msa_enabled; - u8 kscratch_enabled; }; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b6cafb0a9df4..f2b054b80bca 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1066,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu) unsigned int mask = MIPS_CONF_M; /* KScrExist */ - mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; + mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT; return mask; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 591426cda15e..9338aec08790 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -520,33 +520,6 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_LO, #endif KVM_REG_MIPS_PC, - - KVM_REG_MIPS_CP0_INDEX, - KVM_REG_MIPS_CP0_CONTEXT, - KVM_REG_MIPS_CP0_USERLOCAL, - KVM_REG_MIPS_CP0_PAGEMASK, - KVM_REG_MIPS_CP0_WIRED, - KVM_REG_MIPS_CP0_HWRENA, - KVM_REG_MIPS_CP0_BADVADDR, - KVM_REG_MIPS_CP0_COUNT, - KVM_REG_MIPS_CP0_ENTRYHI, - KVM_REG_MIPS_CP0_COMPARE, - KVM_REG_MIPS_CP0_STATUS, - KVM_REG_MIPS_CP0_CAUSE, - KVM_REG_MIPS_CP0_EPC, - KVM_REG_MIPS_CP0_PRID, - KVM_REG_MIPS_CP0_CONFIG, - KVM_REG_MIPS_CP0_CONFIG1, - KVM_REG_MIPS_CP0_CONFIG2, - KVM_REG_MIPS_CP0_CONFIG3, - KVM_REG_MIPS_CP0_CONFIG4, - KVM_REG_MIPS_CP0_CONFIG5, - KVM_REG_MIPS_CP0_CONFIG7, - KVM_REG_MIPS_CP0_ERROREPC, - - KVM_REG_MIPS_COUNT_CTL, - KVM_REG_MIPS_COUNT_RESUME, - KVM_REG_MIPS_COUNT_HZ, }; static u64 kvm_mips_get_one_regs_fpu[] = { @@ -559,15 +532,6 @@ static u64 kvm_mips_get_one_regs_msa[] = { KVM_REG_MIPS_MSA_CSR, }; -static u64 kvm_mips_get_one_regs_kscratch[] = { - KVM_REG_MIPS_CP0_KSCRATCH1, - KVM_REG_MIPS_CP0_KSCRATCH2, - KVM_REG_MIPS_CP0_KSCRATCH3, - KVM_REG_MIPS_CP0_KSCRATCH4, - KVM_REG_MIPS_CP0_KSCRATCH5, - KVM_REG_MIPS_CP0_KSCRATCH6, -}; - static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) { unsigned long ret; @@ -581,7 +545,6 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) } if (kvm_mips_guest_can_have_msa(&vcpu->arch)) ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; - ret += __arch_hweight8(vcpu->arch.kscratch_enabled); ret += kvm_mips_callbacks->num_regs(vcpu); return ret; @@ -634,16 +597,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) } } - for (i = 0; i < 6; ++i) { - if (!(vcpu->arch.kscratch_enabled & BIT(i + 2))) - continue; - - if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i], - sizeof(kvm_mips_get_one_regs_kscratch[i]))) - return -EFAULT; - ++indices; - } - return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); } @@ -734,95 +687,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, v = fpu->msacsr; break; - /* Co-processor 0 registers */ - case KVM_REG_MIPS_CP0_INDEX: - v = (long)kvm_read_c0_guest_index(cop0); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - v = (long)kvm_read_c0_guest_context(cop0); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - v = (long)kvm_read_c0_guest_userlocal(cop0); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - v = (long)kvm_read_c0_guest_pagemask(cop0); - break; - case KVM_REG_MIPS_CP0_WIRED: - v = (long)kvm_read_c0_guest_wired(cop0); - break; - case KVM_REG_MIPS_CP0_HWRENA: - v = (long)kvm_read_c0_guest_hwrena(cop0); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - v = (long)kvm_read_c0_guest_badvaddr(cop0); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - v = (long)kvm_read_c0_guest_entryhi(cop0); - break; - case KVM_REG_MIPS_CP0_COMPARE: - v = (long)kvm_read_c0_guest_compare(cop0); - break; - case KVM_REG_MIPS_CP0_STATUS: - v = (long)kvm_read_c0_guest_status(cop0); - break; - case KVM_REG_MIPS_CP0_CAUSE: - v = (long)kvm_read_c0_guest_cause(cop0); - break; - case KVM_REG_MIPS_CP0_EPC: - v = (long)kvm_read_c0_guest_epc(cop0); - break; - case KVM_REG_MIPS_CP0_PRID: - v = (long)kvm_read_c0_guest_prid(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG: - v = (long)kvm_read_c0_guest_config(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG1: - v = (long)kvm_read_c0_guest_config1(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG2: - v = (long)kvm_read_c0_guest_config2(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG3: - v = (long)kvm_read_c0_guest_config3(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG4: - v = (long)kvm_read_c0_guest_config4(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG5: - v = (long)kvm_read_c0_guest_config5(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG7: - v = (long)kvm_read_c0_guest_config7(cop0); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - v = (long)kvm_read_c0_guest_errorepc(cop0); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - v = (long)kvm_read_c0_guest_kscratch1(cop0); - break; - case 3: - v = (long)kvm_read_c0_guest_kscratch2(cop0); - break; - case 4: - v = (long)kvm_read_c0_guest_kscratch3(cop0); - break; - case 5: - v = (long)kvm_read_c0_guest_kscratch4(cop0); - break; - case 6: - v = (long)kvm_read_c0_guest_kscratch5(cop0); - break; - case 7: - v = (long)kvm_read_c0_guest_kscratch6(cop0); - break; - } - break; /* registers to be handled specially */ default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); @@ -954,68 +818,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, fpu->msacsr = v; break; - /* Co-processor 0 registers */ - case KVM_REG_MIPS_CP0_INDEX: - kvm_write_c0_guest_index(cop0, v); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - kvm_write_c0_guest_context(cop0, v); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - kvm_write_c0_guest_userlocal(cop0, v); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - kvm_write_c0_guest_pagemask(cop0, v); - break; - case KVM_REG_MIPS_CP0_WIRED: - kvm_write_c0_guest_wired(cop0, v); - break; - case KVM_REG_MIPS_CP0_HWRENA: - kvm_write_c0_guest_hwrena(cop0, v); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - kvm_write_c0_guest_badvaddr(cop0, v); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - kvm_write_c0_guest_entryhi(cop0, v); - break; - case KVM_REG_MIPS_CP0_STATUS: - kvm_write_c0_guest_status(cop0, v); - break; - case KVM_REG_MIPS_CP0_EPC: - kvm_write_c0_guest_epc(cop0, v); - break; - case KVM_REG_MIPS_CP0_PRID: - kvm_write_c0_guest_prid(cop0, v); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - kvm_write_c0_guest_errorepc(cop0, v); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - kvm_write_c0_guest_kscratch1(cop0, v); - break; - case 3: - kvm_write_c0_guest_kscratch2(cop0, v); - break; - case 4: - kvm_write_c0_guest_kscratch3(cop0, v); - break; - case 5: - kvm_write_c0_guest_kscratch4(cop0, v); - break; - case 6: - kvm_write_c0_guest_kscratch5(cop0, v); - break; - case 7: - kvm_write_c0_guest_kscratch6(cop0, v); - break; - } - break; /* registers to be handled specially */ default: return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 001c5fb61049..08327de4323a 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -489,8 +489,6 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - vcpu->arch.kscratch_enabled = 0xfc; - /* * Allocate GVA -> HPA page tables. * MIPS doesn't use the mm_struct pointer argument. @@ -640,14 +638,54 @@ static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, kvm_trap_emul_flush_shadow_all(kvm); } +static u64 kvm_trap_emul_get_one_regs[] = { + KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_CONTEXT, + KVM_REG_MIPS_CP0_USERLOCAL, + KVM_REG_MIPS_CP0_PAGEMASK, + KVM_REG_MIPS_CP0_WIRED, + KVM_REG_MIPS_CP0_HWRENA, + KVM_REG_MIPS_CP0_BADVADDR, + KVM_REG_MIPS_CP0_COUNT, + KVM_REG_MIPS_CP0_ENTRYHI, + KVM_REG_MIPS_CP0_COMPARE, + KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_CAUSE, + KVM_REG_MIPS_CP0_EPC, + KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_CONFIG, + KVM_REG_MIPS_CP0_CONFIG1, + KVM_REG_MIPS_CP0_CONFIG2, + KVM_REG_MIPS_CP0_CONFIG3, + KVM_REG_MIPS_CP0_CONFIG4, + KVM_REG_MIPS_CP0_CONFIG5, + KVM_REG_MIPS_CP0_CONFIG7, + KVM_REG_MIPS_CP0_ERROREPC, + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, + + KVM_REG_MIPS_COUNT_CTL, + KVM_REG_MIPS_COUNT_RESUME, + KVM_REG_MIPS_COUNT_HZ, +}; + static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) { - return 0; + return ARRAY_SIZE(kvm_trap_emul_get_one_regs); } static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) { + if (copy_to_user(indices, kvm_trap_emul_get_one_regs, + sizeof(kvm_trap_emul_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs); + return 0; } @@ -655,7 +693,69 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v) { + struct mips_coproc *cop0 = vcpu->arch.cop0; + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + *v = (long)kvm_read_c0_guest_index(cop0); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + *v = (long)kvm_read_c0_guest_context(cop0); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + *v = (long)kvm_read_c0_guest_userlocal(cop0); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + *v = (long)kvm_read_c0_guest_pagemask(cop0); + break; + case KVM_REG_MIPS_CP0_WIRED: + *v = (long)kvm_read_c0_guest_wired(cop0); + break; + case KVM_REG_MIPS_CP0_HWRENA: + *v = (long)kvm_read_c0_guest_hwrena(cop0); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + *v = (long)kvm_read_c0_guest_badvaddr(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + *v = (long)kvm_read_c0_guest_entryhi(cop0); + break; + case KVM_REG_MIPS_CP0_COMPARE: + *v = (long)kvm_read_c0_guest_compare(cop0); + break; + case KVM_REG_MIPS_CP0_STATUS: + *v = (long)kvm_read_c0_guest_status(cop0); + break; + case KVM_REG_MIPS_CP0_CAUSE: + *v = (long)kvm_read_c0_guest_cause(cop0); + break; + case KVM_REG_MIPS_CP0_EPC: + *v = (long)kvm_read_c0_guest_epc(cop0); + break; + case KVM_REG_MIPS_CP0_PRID: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG: + *v = (long)kvm_read_c0_guest_config(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG1: + *v = (long)kvm_read_c0_guest_config1(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG2: + *v = (long)kvm_read_c0_guest_config2(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG3: + *v = (long)kvm_read_c0_guest_config3(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG4: + *v = (long)kvm_read_c0_guest_config4(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG5: + *v = (long)kvm_read_c0_guest_config5(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG7: + *v = (long)kvm_read_c0_guest_config7(cop0); + break; case KVM_REG_MIPS_CP0_COUNT: *v = kvm_mips_read_count(vcpu); break; @@ -668,6 +768,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: *v = vcpu->arch.count_hz; break; + case KVM_REG_MIPS_CP0_ERROREPC: + *v = (long)kvm_read_c0_guest_errorepc(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + *v = (long)kvm_read_c0_guest_kscratch1(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + *v = (long)kvm_read_c0_guest_kscratch2(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + *v = (long)kvm_read_c0_guest_kscratch3(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + *v = (long)kvm_read_c0_guest_kscratch4(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + *v = (long)kvm_read_c0_guest_kscratch5(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + *v = (long)kvm_read_c0_guest_kscratch6(cop0); + break; default: return -EINVAL; } @@ -683,6 +804,39 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, unsigned int cur, change; switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + kvm_write_c0_guest_index(cop0, v); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + kvm_write_c0_guest_context(cop0, v); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + kvm_write_c0_guest_userlocal(cop0, v); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + kvm_write_c0_guest_pagemask(cop0, v); + break; + case KVM_REG_MIPS_CP0_WIRED: + kvm_write_c0_guest_wired(cop0, v); + break; + case KVM_REG_MIPS_CP0_HWRENA: + kvm_write_c0_guest_hwrena(cop0, v); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + kvm_write_c0_guest_badvaddr(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + kvm_write_c0_guest_entryhi(cop0, v); + break; + case KVM_REG_MIPS_CP0_STATUS: + kvm_write_c0_guest_status(cop0, v); + break; + case KVM_REG_MIPS_CP0_EPC: + kvm_write_c0_guest_epc(cop0, v); + break; + case KVM_REG_MIPS_CP0_PRID: + kvm_write_c0_guest_prid(cop0, v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; @@ -759,6 +913,27 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: ret = kvm_mips_set_count_hz(vcpu, v); break; + case KVM_REG_MIPS_CP0_ERROREPC: + kvm_write_c0_guest_errorepc(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + kvm_write_c0_guest_kscratch1(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + kvm_write_c0_guest_kscratch2(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + kvm_write_c0_guest_kscratch3(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + kvm_write_c0_guest_kscratch4(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + kvm_write_c0_guest_kscratch5(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + kvm_write_c0_guest_kscratch6(cop0, v); + break; default: return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 7801bbe1bd907a8f8b136fc184583260508febb6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 14 Nov 2016 23:59:27 +0000 Subject: KVM: MIPS/T&E: Implement CP0_EBase register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CP0_EBase register is a standard feature of MIPS32r2, so we should always have been implementing it properly. However the register value was ignored and wasn't exposed to userland. Fix the emulation of exceptions and interrupts to use the value stored in guest CP0_EBase, and fix the masks so that the top 3 bits (rather than the standard 2) are fixed, so that it is always in the guest KSeg0 segment. Also add CP0_EBASE to the KVM one_reg interface so it can be accessed by userland, also allowing the CPU number field to be written (which isn't permitted by the guest). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 1 + arch/mips/include/asm/kvm_host.h | 3 ++ arch/mips/kvm/emulate.c | 73 ++++++++++++++++++++++----------------- arch/mips/kvm/interrupt.c | 5 +-- arch/mips/kvm/trap_emul.c | 12 +++++++ 5 files changed, 61 insertions(+), 33 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..8d52d0f990ae 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2074,6 +2074,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bc56a312497d..420372fa5bbc 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -88,6 +88,7 @@ #define KVM_GUEST_KUSEG 0x00000000UL #define KVM_GUEST_KSEG0 0x40000000UL +#define KVM_GUEST_KSEG1 0x40000000UL #define KVM_GUEST_KSEG23 0x60000000UL #define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000) #define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) @@ -713,6 +714,8 @@ extern enum emulation_result kvm_mips_emulate_inst(u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu); + extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index f2b054b80bca..d40cfaad4529 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1200,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, er = EMULATE_FAIL; break; } -#define C0_EBASE_CORE_MASK 0xff if ((rd == MIPS_CP0_PRID) && (sel == 1)) { - /* Preserve CORE number */ - kvm_change_c0_guest_ebase(cop0, - ~(C0_EBASE_CORE_MASK), + /* + * Preserve core number, and keep the exception + * base in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000, vcpu->arch.gprs[rt]); - kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n", - kvm_read_c0_guest_ebase(cop0)); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; @@ -1917,6 +1916,22 @@ unknown: return er; } +/** + * kvm_mips_guest_exception_base() - Find guest exception vector base address. + * + * Returns: The base address of the current guest exception vector, taking + * both Guest.CP0_Status.BEV and Guest.CP0_EBase into account. + */ +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + if (kvm_read_c0_guest_status(cop0) & ST0_BEV) + return KVM_GUEST_CKSEG1ADDR(0x1fc00200); + else + return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE; +} + enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, @@ -1942,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause, (EXCCODE_SYS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver SYSCALL when EXL is already set\n"); @@ -1976,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, arch->pc); /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -2019,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n", arch->pc); - - /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; - } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* set pc to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBL << CAUSEB_EXCCODE)); @@ -2064,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, arch->pc); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -2104,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n", arch->pc); - - /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* Set PC to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBS << CAUSEB_EXCCODE)); @@ -2146,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n", arch->pc); - - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_MOD << CAUSEB_EXCCODE)); @@ -2185,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, } - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_CPU << CAUSEB_EXCCODE)); @@ -2219,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, (EXCCODE_RI << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver RI when EXL is already set\n"); @@ -2254,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, (EXCCODE_BP << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver BP when EXL is already set\n"); @@ -2289,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, (EXCCODE_TR << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver TRAP when EXL is already set\n"); @@ -2324,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, (EXCCODE_MSAFPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSAFPE when EXL is already set\n"); @@ -2359,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, (EXCCODE_FPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver FPE when EXL is already set\n"); @@ -2394,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, (EXCCODE_MSADIS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSADIS when EXL is already set\n"); @@ -2560,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause, (exccode << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n", diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index e88403b3dcdd..aa0a1a00faf6 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, (exccode << CAUSEB_EXCCODE)); /* XXXSL Set PC to the interrupt exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu); if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV) - arch->pc = KVM_GUEST_KSEG0 + 0x200; + arch->pc += 0x200; else - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc += 0x180; clear_bit(priority, &vcpu->arch.pending_exceptions); } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 08327de4323a..80a681f42bf5 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -653,6 +653,7 @@ static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_CAUSE, KVM_REG_MIPS_CP0_EPC, KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_EBASE, KVM_REG_MIPS_CP0_CONFIG, KVM_REG_MIPS_CP0_CONFIG1, KVM_REG_MIPS_CP0_CONFIG2, @@ -735,6 +736,9 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_PRID: *v = (long)kvm_read_c0_guest_prid(cop0); break; + case KVM_REG_MIPS_CP0_EBASE: + *v = (long)kvm_read_c0_guest_ebase(cop0); + break; case KVM_REG_MIPS_CP0_CONFIG: *v = (long)kvm_read_c0_guest_config(cop0); break; @@ -837,6 +841,14 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_PRID: kvm_write_c0_guest_prid(cop0, v); break; + case KVM_REG_MIPS_CP0_EBASE: + /* + * Allow core number to be written, but the exception base must + * remain in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM, + v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; -- cgit v1.2.3-70-g09d2 From be67a0be94b65746dee63af5c184c78d00a707f6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 18 Jan 2017 16:20:31 +0000 Subject: KVM: MIPS/T&E: Default to reset vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the default VCPU state closer to the architectural reset state, with PC pointing at the reset vector (uncached PA 0x1fc00000, which for KVM T&E is VA 0x5fc00000), and with CP0_Status.BEV and CP0_Status.ERL to 1. Although QEMU at least will overwrite this state, it makes sense to do this now that CP0_EBase is properly implemented to check BEV, and now that we support a sparse GPA layout potentially with a boot ROM at GPA 0x1fc00000. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 80a681f42bf5..ce44f91c653a 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -614,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */ kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10)); + /* Status */ + kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL); + /* * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5) */ @@ -623,6 +626,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & MIPS_EBASE_CPUNUM)); + /* Put PC at guest reset vector */ + vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000); + return 0; } -- cgit v1.2.3-70-g09d2 From 013044cc65f8661c5fa2b59da5e134b3453d975d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 7 Dec 2016 17:16:37 +0000 Subject: KVM: MIPS/T&E: Expose CP0_EntryLo0/1 registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the CP0_EntryLo0 and CP0_EntryLo1 registers through the KVM register access API. This is fairly straightforward for trap & emulate since we don't support the RI and XI bits. For the sake of future proofing (particularly for VZ) it is explicitly specified that the API always exposes the 64-bit version of these registers (i.e. with the RI and XI bits in bit positions 63 and 62 respectively), and they are implemented in trap_emul.c rather than mips.c to allow them to be implemented differently for VZ. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 8 ++++++++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/trap_emul.c | 14 ++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 8d52d0f990ae..df4a309ba56e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2061,6 +2061,8 @@ registers, find a list below: MIPS | KVM_REG_MIPS_LO | 64 MIPS | KVM_REG_MIPS_PC | 64 MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 @@ -2149,6 +2151,12 @@ patterns depending on whether they're 32-bit or 64-bit registers: 0x7020 0000 0001 00 (32-bit) 0x7030 0000 0001 00 (64-bit) +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 420372fa5bbc..66459ca4af81 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -352,7 +352,9 @@ struct kvm_vcpu_arch { #define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) #define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) #define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) +#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val)) #define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) +#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val)) #define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) #define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) #define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ce44f91c653a..2f9e44b0f177 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -646,6 +646,8 @@ static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_ENTRYLO0, + KVM_REG_MIPS_CP0_ENTRYLO1, KVM_REG_MIPS_CP0_CONTEXT, KVM_REG_MIPS_CP0_USERLOCAL, KVM_REG_MIPS_CP0_PAGEMASK, @@ -706,6 +708,12 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_INDEX: *v = (long)kvm_read_c0_guest_index(cop0); break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + *v = kvm_read_c0_guest_entrylo0(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + *v = kvm_read_c0_guest_entrylo1(cop0); + break; case KVM_REG_MIPS_CP0_CONTEXT: *v = (long)kvm_read_c0_guest_context(cop0); break; @@ -817,6 +825,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_INDEX: kvm_write_c0_guest_index(cop0, v); break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + kvm_write_c0_guest_entrylo0(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + kvm_write_c0_guest_entrylo1(cop0, v); + break; case KVM_REG_MIPS_CP0_CONTEXT: kvm_write_c0_guest_context(cop0, v); break; -- cgit v1.2.3-70-g09d2 From ad58d4d4a274e9290725188c557d16e7d0cd1b3d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 2 Feb 2015 22:55:17 +0000 Subject: KVM: MIPS/T&E: Expose read-only CP0_IntCtl register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the CP0_IntCtl register through the KVM register access API, which is a required register since MIPS32r2. It is currently read-only since the VS field isn't implemented due to lack of Config3.VInt or Config3.VEIC. It is implemented in trap_emul.c so that a VZ implementation can allow writes. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 1 + arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/trap_emul.c | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index df4a309ba56e..d34b03c99233 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2073,6 +2073,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 66459ca4af81..ebcc55963941 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -43,6 +43,7 @@ #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) +#define KVM_REG_MIPS_CP0_INTCTL MIPS_CP0_32(12, 1) #define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) #define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0) #define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 2f9e44b0f177..b1fa53b252ea 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -658,6 +658,7 @@ static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_ENTRYHI, KVM_REG_MIPS_CP0_COMPARE, KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_INTCTL, KVM_REG_MIPS_CP0_CAUSE, KVM_REG_MIPS_CP0_EPC, KVM_REG_MIPS_CP0_PRID, @@ -741,6 +742,9 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_STATUS: *v = (long)kvm_read_c0_guest_status(cop0); break; + case KVM_REG_MIPS_CP0_INTCTL: + *v = (long)kvm_read_c0_guest_intctl(cop0); + break; case KVM_REG_MIPS_CP0_CAUSE: *v = (long)kvm_read_c0_guest_cause(cop0); break; @@ -855,6 +859,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_STATUS: kvm_write_c0_guest_status(cop0, v); break; + case KVM_REG_MIPS_CP0_INTCTL: + /* No VInt, so no VS, read-only for now */ + break; case KVM_REG_MIPS_CP0_EPC: kvm_write_c0_guest_epc(cop0, v); break; -- cgit v1.2.3-70-g09d2 From 12ed1faece3f141c2604b5b3a8377ba71d23ec9d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 22:39:39 +0000 Subject: KVM: MIPS: Allow multiple VCPUs to be created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase the maximum number of MIPS KVM VCPUs to 8, and implement the KVM_CAP_NR_VCPUS and KVM_CAP_MAX_CPUS capabilities which expose the recommended and maximum number of VCPUs to userland. The previous maximum of 1 didn't allow for any form of SMP guests. We calculate the values similarly to ARM, recommending as many VCPUs as there are CPUs online in the system. This will allow userland to know how many VCPUs it is possible to create. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mips.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index ebcc55963941..05e785fc061d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -65,7 +65,7 @@ #define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS 8 #define KVM_USER_MEM_SLOTS 8 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9338aec08790..31ee5ee0010b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1026,6 +1026,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ r = !!raw_cpu_has_fpu; -- cgit v1.2.3-70-g09d2 From e1e8a9624f7ba8ead4f056ff558ed070e86fa747 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 2 Feb 2017 16:39:31 +0100 Subject: KVM: s390: Disable dirty log retrieval for UCONTROL guests User controlled KVM guests do not support the dirty log, as they have no single gmap that we can check for changes. As they have no single gmap, kvm->arch.gmap is NULL and all further referencing to it for dirty checking will result in a NULL dereference. Let's return -EINVAL if a caller tries to sync dirty logs for a UCONTROL guest. Fixes: 15f36eb ("KVM: s390: Add proper dirty bitmap support to S390 kvm.") Cc: # 3.16+ Signed-off-by: Janosch Frank Reported-by: Martin Schwidefsky Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dabd3b15bf11..502de74ea984 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -442,6 +442,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + mutex_lock(&kvm->slots_lock); r = -EINVAL; -- cgit v1.2.3-70-g09d2 From fb7dc1d4ddce744c8d8e1aca19d4982102cf72e1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 26 Jan 2017 20:45:33 +0100 Subject: KVM: s390: detect some program check loops Sometimes (e.g. early boot) a guest is broken in such ways that it loops 100% delivering operation exceptions (illegal operation) but the pgm new PSW is not set properly. This will result in code being read from address zero, which usually contains another illegal op. Let's detect this case and return to userspace. Instead of only detecting this for address zero apply a heuristic that will work for any program check new psw. We do not want guest problem state to be able to trigger a guest panic, e.g. by faulting on an address that is the same as the program check new PSW, so we check for the problem state bit being off. With proper handling in userspace we a: get rid of CPU consumption of such broken guests b: keep the program old PSW. This allows to find out the original illegal operation - making debugging such early boot issues much easier than with single stepping Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck --- arch/s390/kvm/intercept.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8b13f7098c61..59920f96ebc0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -361,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) static int handle_operexc(struct kvm_vcpu *vcpu) { + psw_t oldpsw, newpsw; + int rc; + vcpu->stat.exit_operation_exception++; trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); @@ -371,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + /* + * Avoid endless loops of operation exceptions, if the pgm new + * PSW will cause a new operation exception. + * The heuristic checks if the pgm new psw is within 6 bytes before + * the faulting psw address (with same DAT, AS settings) and the + * new psw is not a wait psw and the fault was not triggered by + * problem state. + */ + oldpsw = vcpu->arch.sie_block->gpsw; + if (oldpsw.addr - newpsw.addr <= 6 && + !(newpsw.mask & PSW_MASK_WAIT) && + !(oldpsw.mask & PSW_MASK_PSTATE) && + (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) && + (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT)) + return -EOPNOTSUPP; return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } -- cgit v1.2.3-70-g09d2 From 42cf014d38d8822cce63703a467e00f65d000952 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:57 +0100 Subject: KVM: nVMX: kmap() can't fail kmap() can't fail, therefore it will always return a valid pointer. Let's just get rid of the unnecessary checks. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d850d5d36182..693e4203b666 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4973,10 +4973,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -9738,11 +9734,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); -- cgit v1.2.3-70-g09d2 From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:58 +0100 Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail vmx_complete_nested_posted_interrupt() can't fail, let's turn it into a void function. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 693e4203b666..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,13 +4964,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); @@ -4983,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -10695,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 55dd00a73a518281bc846dc5de1a718349431eb2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:39 -0200 Subject: KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall Add a hypercall to retrieve the host realtime clock and the TSC value used to calculate that clock read. Used to implement clock synchronization between host and guest. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/hypercalls.txt | 35 +++++++++++++++++ arch/x86/include/uapi/asm/kvm_para.h | 9 +++++ arch/x86/kvm/x86.c | 66 ++++++++++++++++++++++++++++++++ include/uapi/linux/kvm_para.h | 2 + 4 files changed, 112 insertions(+) diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fd4d4f35caf..09e5d31dac98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1142,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1165,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1626,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1635,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -6112,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + cycle_t cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6176,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index bf6cd7d5cac2..fed506aeff62 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -14,6 +14,7 @@ #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM +#define KVM_EOPNOTSUPP 95 #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 @@ -23,6 +24,7 @@ #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 #define KVM_HC_MIPS_EXIT_VM 7 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 +#define KVM_HC_CLOCK_PAIRING 9 /* * hypercalls use architecture specific -- cgit v1.2.3-70-g09d2 From 80fbd89cbd07287a7013006c14ddec923b7a4ff6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 10:57:24 +0100 Subject: KVM: x86: fix compilation Fix rebase breakage from commit 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall", 2017-01-24), courtesy of the "I could have sworn I had pushed the right branch" department. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 09e5d31dac98..96dd7dd13ee6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6153,7 +6153,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, { struct kvm_clock_pairing clock_pairing; struct timespec ts; - cycle_t cycle; + u64 cycle; int ret; if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) -- cgit v1.2.3-70-g09d2 From 0bdbf3b071986ba80731203683cf623d5c0cacb1 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 2 Feb 2017 20:30:03 -0600 Subject: KVM: arm/arm64: vgic: Stop injecting the MSI occurrence twice The IRQFD framework calls the architecture dependent function twice if the corresponding GSI type is edge triggered. For ARM, the function kvm_set_msi() is getting called twice whenever the IRQFD receives the event signal. The rest of the code path is trying to inject the MSI without any validation checks. No need to call the function vgic_its_inject_msi() second time to avoid an unnecessary overhead in IRQ queue logic. It also avoids the possibility of VM seeing the MSI twice. Simple fix, return -1 if the argument 'level' value is zero. Cc: stable@vger.kernel.org Reviewed-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-irqfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c index d918dcf26a5a..f138ed2e9c63 100644 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ b/virt/kvm/arm/vgic/vgic-irqfd.c @@ -99,6 +99,9 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, if (!vgic_has_its(kvm)) return -ENODEV; + if (!level) + return -1; + return vgic_its_inject_msi(kvm, &msi); } -- cgit v1.2.3-70-g09d2 From fbb4aeec5fc2ab47615b2a0cbabc503e1eef4c60 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:19:59 -0500 Subject: KVM: arm/arm64: Abstract virtual timer context into separate structure Abstract virtual timer context into a separate structure and change all callers referring to timer registers, irq state and so on. No change in functionality. This is about to become very handy when adding the EL1 physical timer. Signed-off-by: Jintack Lim Acked-by: Christoffer Dall Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 27 ++++++++--------- virt/kvm/arm/arch_timer.c | 69 +++++++++++++++++++++++--------------------- virt/kvm/arm/hyp/timer-sr.c | 10 ++++--- 3 files changed, 56 insertions(+), 50 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 5c970ce67949..daad3c133b9f 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -28,15 +28,20 @@ struct arch_timer_kvm { u64 cntvoff; }; -struct arch_timer_cpu { +struct arch_timer_context { /* Registers: control register, timer value */ - u32 cntv_ctl; /* Saved/restored */ - u64 cntv_cval; /* Saved/restored */ + u32 cnt_ctl; + u64 cnt_cval; + + /* Timer IRQ */ + struct kvm_irq_level irq; + + /* Active IRQ state caching */ + bool active_cleared_last; +}; - /* - * Anything that is not used directly from assembly code goes - * here. - */ +struct arch_timer_cpu { + struct arch_timer_context vtimer; /* Background timer used when the guest is not running */ struct hrtimer timer; @@ -47,12 +52,6 @@ struct arch_timer_cpu { /* Background timer active */ bool armed; - /* Timer IRQ */ - struct kvm_irq_level irq; - - /* Active IRQ state caching */ - bool active_cleared_last; - /* Is the timer enabled */ bool enabled; }; @@ -77,4 +76,6 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); void kvm_timer_init_vhe(void); + +#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) #endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 91ecf48f7fe2..d3556b3ca694 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -37,7 +37,7 @@ static u32 host_vtimer_irq_flags; void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { - vcpu->arch.timer_cpu.active_cleared_last = false; + vcpu_vtimer(vcpu)->active_cleared_last = false; } static u64 kvm_phys_timer_read(void) @@ -102,7 +102,7 @@ static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu) { u64 cval, now; - cval = vcpu->arch.timer_cpu.cntv_cval; + cval = vcpu_vtimer(vcpu)->cnt_cval; now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; if (now < cval) { @@ -144,21 +144,21 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); + return !(vtimer->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (vtimer->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); } bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); u64 cval, now; if (!kvm_timer_irq_can_fire(vcpu)) return false; - cval = timer->cntv_cval; + cval = vtimer->cnt_cval; now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; return cval <= now; @@ -167,18 +167,18 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) { int ret; - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); BUG_ON(!vgic_initialized(vcpu->kvm)); - timer->active_cleared_last = false; - timer->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq, - timer->irq.level); + vtimer->active_cleared_last = false; + vtimer->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, vtimer->irq.irq, + vtimer->irq.level); ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - timer->irq.irq, - timer->irq.level); + vtimer->irq.irq, + vtimer->irq.level); WARN_ON(ret); } @@ -189,18 +189,19 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) static int kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); /* * If userspace modified the timer registers via SET_ONE_REG before - * the vgic was initialized, we mustn't set the timer->irq.level value + * the vgic was initialized, we mustn't set the vtimer->irq.level value * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ if (!vgic_initialized(vcpu->kvm) || !timer->enabled) return -ENODEV; - if (kvm_timer_should_fire(vcpu) != timer->irq.level) - kvm_timer_update_irq(vcpu, !timer->irq.level); + if (kvm_timer_should_fire(vcpu) != vtimer->irq.level) + kvm_timer_update_irq(vcpu, !vtimer->irq.level); return 0; } @@ -250,7 +251,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) */ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; int ret; @@ -274,8 +275,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) * to ensure that hardware interrupts from the timer triggers a guest * exit. */ - phys_active = timer->irq.level || - kvm_vgic_map_is_active(vcpu, timer->irq.irq); + phys_active = vtimer->irq.level || + kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); /* * We want to avoid hitting the (re)distributor as much as @@ -297,7 +298,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) * - cached value is "active clear" * - value to be programmed is "active clear" */ - if (timer->active_cleared_last && !phys_active) + if (vtimer->active_cleared_last && !phys_active) return; ret = irq_set_irqchip_state(host_vtimer_irq, @@ -305,7 +306,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) phys_active); WARN_ON(ret); - timer->active_cleared_last = !phys_active; + vtimer->active_cleared_last = !phys_active; } /** @@ -331,7 +332,7 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, const struct kvm_irq_level *irq) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); /* * The vcpu timer irq number cannot be determined in @@ -339,7 +340,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * kvm_vcpu_set_target(). To handle this, we determine * vcpu timer irq number when the vcpu is reset. */ - timer->irq.irq = irq->irq; + vtimer->irq.irq = irq->irq; /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -347,7 +348,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - timer->cntv_ctl = 0; + vtimer->cnt_ctl = 0; kvm_timer_update_state(vcpu); return 0; @@ -369,17 +370,17 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); switch (regid) { case KVM_REG_ARM_TIMER_CTL: - timer->cntv_ctl = value; + vtimer->cnt_ctl = value; break; case KVM_REG_ARM_TIMER_CNT: vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; break; case KVM_REG_ARM_TIMER_CVAL: - timer->cntv_cval = value; + vtimer->cnt_cval = value; break; default: return -1; @@ -391,15 +392,15 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); switch (regid) { case KVM_REG_ARM_TIMER_CTL: - return timer->cntv_ctl; + return vtimer->cnt_ctl; case KVM_REG_ARM_TIMER_CNT: return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; case KVM_REG_ARM_TIMER_CVAL: - return timer->cntv_cval; + return vtimer->cnt_cval; } return (u64)-1; } @@ -463,14 +464,16 @@ int kvm_timer_hyp_init(void) void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); timer_disarm(timer); - kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq); + kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct irq_desc *desc; struct irq_data *data; int phys_irq; @@ -498,7 +501,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) * Tell the VGIC that the virtual interrupt is tied to a * physical interrupt. We do that once per VCPU. */ - ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq); + ret = kvm_vgic_map_phys_irq(vcpu, vtimer->irq.irq, phys_irq); if (ret) return ret; diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c index 63e28dd18bb0..0cf08953e81c 100644 --- a/virt/kvm/arm/hyp/timer-sr.c +++ b/virt/kvm/arm/hyp/timer-sr.c @@ -25,11 +25,12 @@ void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); u64 val; if (timer->enabled) { - timer->cntv_ctl = read_sysreg_el0(cntv_ctl); - timer->cntv_cval = read_sysreg_el0(cntv_cval); + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); + vtimer->cnt_cval = read_sysreg_el0(cntv_cval); } /* Disable the virtual timer */ @@ -54,6 +55,7 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); u64 val; /* Those bits are already configured at boot on VHE-system */ @@ -70,8 +72,8 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) if (timer->enabled) { write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); - write_sysreg_el0(timer->cntv_cval, cntv_cval); + write_sysreg_el0(vtimer->cnt_cval, cntv_cval); isb(); - write_sysreg_el0(timer->cntv_ctl, cntv_ctl); + write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); } } -- cgit v1.2.3-70-g09d2 From 90de943a430028ee389b22bf4a7ae5867c32ce0c Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:00 -0500 Subject: KVM: arm/arm64: Move cntvoff to each timer context Make cntvoff per each timer context. This is helpful to abstract kvm timer functions to work with timer context without considering timer types (e.g. physical timer or virtual timer). This also would pave the way for ever doing adjustments of the cntvoff on a per-CPU basis if that should ever make sense. Signed-off-by: Jintack Lim Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 3 --- arch/arm/kvm/arm.c | 1 - arch/arm64/include/asm/kvm_host.h | 3 --- include/kvm/arm_arch_timer.h | 9 +++------ virt/kvm/arm/arch_timer.c | 38 ++++++++++++++++++++++++++++---------- virt/kvm/arm/hyp/timer-sr.c | 3 +-- 6 files changed, 32 insertions(+), 25 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index d5423ab15ed5..cc495d799c67 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -60,9 +60,6 @@ struct kvm_arch { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; - /* Timer */ - struct arch_timer_kvm timer; - /* * Anything that is not used directly from assembly code goes * here. diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9d7446456e0c..f93f2171a48b 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -135,7 +135,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) goto out_free_stage2_pgd; kvm_vgic_early_init(kvm); - kvm_timer_init(kvm); /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e5050388e062..4a758cba1262 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -70,9 +70,6 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_dist vgic; - - /* Timer */ - struct arch_timer_kvm timer; }; #define KVM_NR_MEM_OBJS 40 diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index daad3c133b9f..2c8560b4642a 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -23,11 +23,6 @@ #include #include -struct arch_timer_kvm { - /* Virtual offset */ - u64 cntvoff; -}; - struct arch_timer_context { /* Registers: control register, timer value */ u32 cnt_ctl; @@ -38,6 +33,9 @@ struct arch_timer_context { /* Active IRQ state caching */ bool active_cleared_last; + + /* Virtual offset */ + u64 cntvoff; }; struct arch_timer_cpu { @@ -58,7 +56,6 @@ struct arch_timer_cpu { int kvm_timer_hyp_init(void); int kvm_timer_enable(struct kvm_vcpu *vcpu); -void kvm_timer_init(struct kvm *kvm); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, const struct kvm_irq_level *irq); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index d3556b3ca694..5004a679b125 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -101,9 +101,10 @@ static void kvm_timer_inject_irq_work(struct work_struct *work) static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu) { u64 cval, now; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - cval = vcpu_vtimer(vcpu)->cnt_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + cval = vtimer->cnt_cval; + now = kvm_phys_timer_read() - vtimer->cntvoff; if (now < cval) { u64 ns; @@ -159,7 +160,7 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return false; cval = vtimer->cnt_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + now = kvm_phys_timer_read() - vtimer->cntvoff; return cval <= now; } @@ -354,10 +355,32 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, return 0; } +/* Make the updates of cntvoff for all vtimer contexts atomic */ +static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) +{ + int i; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, tmp, kvm) + vcpu_vtimer(tmp)->cntvoff = cntvoff; + + /* + * When called from the vcpu create path, the CPU being created is not + * included in the loop above, so we just set it here as well. + */ + vcpu_vtimer(vcpu)->cntvoff = cntvoff; + mutex_unlock(&kvm->lock); +} + void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + /* Synchronize cntvoff across all vtimers of a VM. */ + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; @@ -377,7 +400,7 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) vtimer->cnt_ctl = value; break; case KVM_REG_ARM_TIMER_CNT: - vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: vtimer->cnt_cval = value; @@ -398,7 +421,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) case KVM_REG_ARM_TIMER_CTL: return vtimer->cnt_ctl; case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + return kvm_phys_timer_read() - vtimer->cntvoff; case KVM_REG_ARM_TIMER_CVAL: return vtimer->cnt_cval; } @@ -510,11 +533,6 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return 0; } -void kvm_timer_init(struct kvm *kvm) -{ - kvm->arch.timer.cntvoff = kvm_phys_timer_read(); -} - /* * On VHE system, we only need to configure trap on physical timer and counter * accesses in EL0 and EL1 once, not for every world switch. diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c index 0cf08953e81c..4734915ab71f 100644 --- a/virt/kvm/arm/hyp/timer-sr.c +++ b/virt/kvm/arm/hyp/timer-sr.c @@ -53,7 +53,6 @@ void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); u64 val; @@ -71,7 +70,7 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) } if (timer->enabled) { - write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); + write_sysreg(vtimer->cntvoff, cntvoff_el2); write_sysreg_el0(vtimer->cnt_cval, cntv_cval); isb(); write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); -- cgit v1.2.3-70-g09d2 From 9171fa2e0951b0cb6c443e0dce2c4620de3b1dfe Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:01 -0500 Subject: KVM: arm/arm64: Decouple kvm timer functions from virtual timer Now that we have a separate structure for timer context, make functions generic so that they can work with any timer context, not just the virtual timer context. This does not change the virtual timer functionality. Signed-off-by: Jintack Lim Acked-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/arm.c | 2 +- include/kvm/arm_arch_timer.h | 2 +- virt/kvm/arm/arch_timer.c | 54 ++++++++++++++++++++------------------------ 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f93f2171a48b..0ecd6cf362fc 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -300,7 +300,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_timer_should_fire(vcpu); + return kvm_timer_should_fire(vcpu_vtimer(vcpu)); } void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 2c8560b4642a..f46fa3b62b06 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -66,7 +66,7 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); -bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); +bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); void kvm_timer_schedule(struct kvm_vcpu *vcpu); void kvm_timer_unschedule(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 5004a679b125..5261f98ae686 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -98,13 +98,12 @@ static void kvm_timer_inject_irq_work(struct work_struct *work) kvm_vcpu_kick(vcpu); } -static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu) +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { u64 cval, now; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - cval = vtimer->cnt_cval; - now = kvm_phys_timer_read() - vtimer->cntvoff; + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; if (now < cval) { u64 ns; @@ -133,7 +132,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) * PoV (NTP on the host may have forced it to expire * early). If we should have slept longer, restart it. */ - ns = kvm_timer_compute_delta(vcpu); + ns = kvm_timer_compute_delta(vcpu_vtimer(vcpu)); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; @@ -143,43 +142,39 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - return !(vtimer->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (vtimer->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); + return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); } -bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) +bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); u64 cval, now; - if (!kvm_timer_irq_can_fire(vcpu)) + if (!kvm_timer_irq_can_fire(timer_ctx)) return false; - cval = vtimer->cnt_cval; - now = kvm_phys_timer_read() - vtimer->cntvoff; + cval = timer_ctx->cnt_cval; + now = kvm_phys_timer_read() - timer_ctx->cntvoff; return cval <= now; } -static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx) { int ret; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); BUG_ON(!vgic_initialized(vcpu->kvm)); - vtimer->active_cleared_last = false; - vtimer->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, vtimer->irq.irq, - vtimer->irq.level); + timer_ctx->active_cleared_last = false; + timer_ctx->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, + timer_ctx->irq.level); - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - vtimer->irq.irq, - vtimer->irq.level); + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, + timer_ctx->irq.level); WARN_ON(ret); } @@ -201,8 +196,8 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) if (!vgic_initialized(vcpu->kvm) || !timer->enabled) return -ENODEV; - if (kvm_timer_should_fire(vcpu) != vtimer->irq.level) - kvm_timer_update_irq(vcpu, !vtimer->irq.level); + if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) + kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); return 0; } @@ -215,6 +210,7 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) void kvm_timer_schedule(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); BUG_ON(timer_is_armed(timer)); @@ -223,18 +219,18 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) * already expired, because kvm_vcpu_block will return before putting * the thread to sleep. */ - if (kvm_timer_should_fire(vcpu)) + if (kvm_timer_should_fire(vtimer)) return; /* * If the timer is not capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ - if (!kvm_timer_irq_can_fire(vcpu)) + if (!kvm_timer_irq_can_fire(vtimer)) return; /* The timer has not yet expired, schedule a background timer */ - timer_arm(timer, kvm_timer_compute_delta(vcpu)); + timer_arm(timer, kvm_timer_compute_delta(vtimer)); } void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 009a5701bb2d166073f75643bc9237fe014c6bf5 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:02 -0500 Subject: KVM: arm/arm64: Add the EL1 physical timer context Add the EL1 physical timer context. Signed-off-by: Jintack Lim Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index f46fa3b62b06..6445a3d9a6e2 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -40,6 +40,7 @@ struct arch_timer_context { struct arch_timer_cpu { struct arch_timer_context vtimer; + struct arch_timer_context ptimer; /* Background timer used when the guest is not running */ struct hrtimer timer; @@ -75,4 +76,5 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); void kvm_timer_init_vhe(void); #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) +#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) #endif -- cgit v1.2.3-70-g09d2 From a91d18551e7b35e34a04b6fd199ca8568e7e9315 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:03 -0500 Subject: KVM: arm/arm64: Initialize the emulated EL1 physical timer Initialize the emulated EL1 physical timer with the default irq number. Signed-off-by: Jintack Lim Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/reset.c | 9 ++++++++- arch/arm64/kvm/reset.c | 9 ++++++++- include/kvm/arm_arch_timer.h | 3 ++- virt/kvm/arm/arch_timer.c | 9 +++++++-- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index 4b5e802e57d1..1da8b2d14550 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -37,6 +37,11 @@ static struct kvm_regs cortexa_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; +static const struct kvm_irq_level cortexa_ptimer_irq = { + { .irq = 30 }, + .level = 1, +}; + static const struct kvm_irq_level cortexa_vtimer_irq = { { .irq = 27 }, .level = 1, @@ -58,6 +63,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct kvm_regs *reset_regs; const struct kvm_irq_level *cpu_vtimer_irq; + const struct kvm_irq_level *cpu_ptimer_irq; switch (vcpu->arch.target) { case KVM_ARM_TARGET_CORTEX_A7: @@ -65,6 +71,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) reset_regs = &cortexa_regs_reset; vcpu->arch.midr = read_cpuid_id(); cpu_vtimer_irq = &cortexa_vtimer_irq; + cpu_ptimer_irq = &cortexa_ptimer_irq; break; default: return -ENODEV; @@ -77,5 +84,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_reset_coprocs(vcpu); /* Reset arch_timer context */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); + return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e95d4f68bf54..d9e9697de1b2 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -46,6 +46,11 @@ static const struct kvm_regs default_regs_reset32 = { COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), }; +static const struct kvm_irq_level default_ptimer_irq = { + .irq = 30, + .level = 1, +}; + static const struct kvm_irq_level default_vtimer_irq = { .irq = 27, .level = 1, @@ -104,6 +109,7 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { const struct kvm_irq_level *cpu_vtimer_irq; + const struct kvm_irq_level *cpu_ptimer_irq; const struct kvm_regs *cpu_reset; switch (vcpu->arch.target) { @@ -117,6 +123,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } cpu_vtimer_irq = &default_vtimer_irq; + cpu_ptimer_irq = &default_ptimer_irq; break; } @@ -130,5 +137,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_reset(vcpu); /* Reset timer */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); + return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6445a3d9a6e2..f1d2fba0b9c6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -58,7 +58,8 @@ struct arch_timer_cpu { int kvm_timer_hyp_init(void); int kvm_timer_enable(struct kvm_vcpu *vcpu); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, - const struct kvm_irq_level *irq); + const struct kvm_irq_level *virt_irq, + const struct kvm_irq_level *phys_irq); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 5261f98ae686..dbd0af19d27e 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -327,9 +327,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, - const struct kvm_irq_level *irq) + const struct kvm_irq_level *virt_irq, + const struct kvm_irq_level *phys_irq) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* * The vcpu timer irq number cannot be determined in @@ -337,7 +339,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * kvm_vcpu_set_target(). To handle this, we determine * vcpu timer irq number when the vcpu is reset. */ - vtimer->irq.irq = irq->irq; + vtimer->irq.irq = virt_irq->irq; + ptimer->irq.irq = phys_irq->irq; /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -346,6 +349,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * the ARMv7 architecture. */ vtimer->cnt_ctl = 0; + ptimer->cnt_ctl = 0; kvm_timer_update_state(vcpu); return 0; @@ -376,6 +380,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) /* Synchronize cntvoff across all vtimers of a VM. */ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + vcpu_ptimer(vcpu)->cntvoff = 0; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); -- cgit v1.2.3-70-g09d2 From 58e0c9732a31afdef488a41fd1edba065124f442 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:04 -0500 Subject: KVM: arm/arm64: Update the physical timer interrupt level Now that we maintain the EL1 physical timer register states of VMs, update the physical timer interrupt level along with the virtual one. Signed-off-by: Jintack Lim Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index dbd0af19d27e..7f9a66419991 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -186,6 +186,7 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* * If userspace modified the timer registers via SET_ONE_REG before @@ -199,6 +200,9 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); + if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) + kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + return 0; } -- cgit v1.2.3-70-g09d2 From fb280e97576a91c01b2a1712dba31024748b3084 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:05 -0500 Subject: KVM: arm/arm64: Set a background timer to the earliest timer expiration When scheduling a background timer, consider both of the virtual and physical timer and pick the earliest expiration time. Signed-off-by: Jintack Lim Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/arm.c | 3 ++- virt/kvm/arm/arch_timer.c | 53 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 0ecd6cf362fc..21c493a9e5c9 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -300,7 +300,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_timer_should_fire(vcpu_vtimer(vcpu)); + return kvm_timer_should_fire(vcpu_vtimer(vcpu)) || + kvm_timer_should_fire(vcpu_ptimer(vcpu)); } void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 7f9a66419991..0ea745290871 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -118,6 +118,35 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) return 0; } +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) +{ + return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); +} + +/* + * Returns the earliest expiration time in ns among guest timers. + * Note that it will return 0 if none of timers can fire. + */ +static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) +{ + u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + if (kvm_timer_irq_can_fire(vtimer)) + min_virt = kvm_timer_compute_delta(vtimer); + + if (kvm_timer_irq_can_fire(ptimer)) + min_phys = kvm_timer_compute_delta(ptimer); + + /* If none of timers can fire, then return 0 */ + if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) + return 0; + + return min(min_virt, min_phys); +} + static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) { struct arch_timer_cpu *timer; @@ -132,7 +161,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) * PoV (NTP on the host may have forced it to expire * early). If we should have slept longer, restart it. */ - ns = kvm_timer_compute_delta(vcpu_vtimer(vcpu)); + ns = kvm_timer_earliest_exp(vcpu); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; @@ -142,12 +171,6 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) -{ - return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); -} - bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { u64 cval, now; @@ -215,26 +238,30 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); BUG_ON(timer_is_armed(timer)); /* - * No need to schedule a background timer if the guest timer has + * No need to schedule a background timer if any guest timer has * already expired, because kvm_vcpu_block will return before putting * the thread to sleep. */ - if (kvm_timer_should_fire(vtimer)) + if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer)) return; /* - * If the timer is not capable of raising interrupts (disabled or + * If both timers are not capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ - if (!kvm_timer_irq_can_fire(vtimer)) + if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) return; - /* The timer has not yet expired, schedule a background timer */ - timer_arm(timer, kvm_timer_compute_delta(vtimer)); + /* + * The guest timers have not yet expired, schedule a background timer. + * Set the earliest expiration time among the guest timers. + */ + timer_arm(timer, kvm_timer_earliest_exp(vcpu)); } void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From f242adaf0c7e8376e9f54ffd6ef088df84577893 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:06 -0500 Subject: KVM: arm/arm64: Set up a background timer for the physical timer emulation Set a background timer for the EL1 physical timer emulation while VMs are running, so that VMs get the physical timer interrupts in a timely manner. Schedule the background timer on entry to the VM and cancel it on exit. This would not have any performance impact to the guest OSes that currently use the virtual timer since the physical timer is always not enabled. Signed-off-by: Jintack Lim Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 0ea745290871..33257b560f74 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -229,6 +229,22 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) return 0; } +/* Schedule the background timer for the emulated timer. */ +static void kvm_timer_emulate(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer_ctx) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + if (kvm_timer_should_fire(timer_ctx)) + return; + + if (!kvm_timer_irq_can_fire(timer_ctx)) + return; + + /* The timer has not yet expired, schedule a background timer */ + timer_arm(timer, kvm_timer_compute_delta(timer_ctx)); +} + /* * Schedule the background timer before calling kvm_vcpu_block, so that this * thread is removed from its waitqueue and made runnable when there's a timer @@ -286,6 +302,9 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) if (kvm_timer_update_state(vcpu)) return; + /* Set the background timer for the physical timer emulation. */ + kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); + /* * If we enter the guest with the virtual input level to the VGIC * asserted, then we have already told the VGIC what we need to, and @@ -348,7 +367,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - BUG_ON(timer_is_armed(timer)); + /* + * This is to cancel the background timer for the physical timer + * emulation if it is set. + */ + timer_disarm(timer); /* * The guest could have modified the timer registers or the timer -- cgit v1.2.3-70-g09d2 From c9a3c58f01fb0af78b512ab4515d16f3ef1a03f1 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:07 -0500 Subject: KVM: arm64: Add the EL1 physical timer access handler KVM traps on the EL1 phys timer accesses from VMs, but it doesn't handle those traps. This results in terminating VMs. Instead, set a handler for the EL1 phys timer access, and inject an undefined exception as an intermediate step. Signed-off-by: Jintack Lim Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index caa47ce7e006..1cd3464ff88d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -820,6 +820,30 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } +static bool access_cntp_tval(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return true; +} + +static bool access_cntp_ctl(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return true; +} + +static bool access_cntp_cval(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return true; +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -1029,6 +1053,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), NULL, reset_unknown, TPIDRRO_EL0 }, + /* CNTP_TVAL_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b000), + access_cntp_tval }, + /* CNTP_CTL_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b001), + access_cntp_ctl }, + /* CNTP_CVAL_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b010), + access_cntp_cval }, + /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), -- cgit v1.2.3-70-g09d2 From 7b6b46311a8562fb3a9e035ed6ffab6d49c28886 Mon Sep 17 00:00:00 2001 From: Jintack Lim Date: Fri, 3 Feb 2017 10:20:08 -0500 Subject: KVM: arm/arm64: Emulate the EL1 phys timer registers Emulate read and write operations to CNTP_TVAL, CNTP_CVAL and CNTP_CTL. Now VMs are able to use the EL1 physical timer. Signed-off-by: Jintack Lim Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 37 ++++++++++++++++++++++++++++++++++--- include/kvm/arm_arch_timer.h | 2 ++ virt/kvm/arm/arch_timer.c | 2 +- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1cd3464ff88d..0e26f8c2b56f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -824,7 +824,14 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - kvm_inject_undefined(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 now = kvm_phys_timer_read(); + + if (p->is_write) + ptimer->cnt_cval = p->regval + now; + else + p->regval = ptimer->cnt_cval - now; + return true; } @@ -832,7 +839,25 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - kvm_inject_undefined(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + if (p->is_write) { + /* ISTATUS bit is read-only */ + ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT; + } else { + u64 now = kvm_phys_timer_read(); + + p->regval = ptimer->cnt_ctl; + /* + * Set ISTATUS bit if it's expired. + * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit + * regardless of ENABLE bit for our implementation convenience. + */ + if (ptimer->cnt_cval <= now) + p->regval |= ARCH_TIMER_CTRL_IT_STAT; + } + return true; } @@ -840,7 +865,13 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - kvm_inject_undefined(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + if (p->is_write) + ptimer->cnt_cval = p->regval; + else + p->regval = ptimer->cnt_cval; + return true; } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index f1d2fba0b9c6..fe797d6ef89d 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -72,6 +72,8 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); void kvm_timer_schedule(struct kvm_vcpu *vcpu); void kvm_timer_unschedule(struct kvm_vcpu *vcpu); +u64 kvm_phys_timer_read(void); + void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); void kvm_timer_init_vhe(void); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 33257b560f74..35d7100e0815 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -40,7 +40,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) vcpu_vtimer(vcpu)->active_cleared_last = false; } -static u64 kvm_phys_timer_read(void) +u64 kvm_phys_timer_read(void) { return timecounter->cc->read(timecounter->cc); } -- cgit v1.2.3-70-g09d2 From f4066c2bc4d0de4e5dcbff21dae41e89fe8f38c0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:41 -0200 Subject: kvmclock: export kvmclock clocksource and data pointers To be used by KVM PTP driver. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvmclock.h | 6 ++++++ arch/x86/kernel/kvmclock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/kvmclock.h diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..995fa260a6da 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include #include +#include static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { -- cgit v1.2.3-70-g09d2 From a0e136d436ded817c0aade72efdefa56a00b4e5e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:42 -0200 Subject: PTP: add kvm PTP driver Add a driver with gettime method returning hosts realtime clock. This allows Chrony to synchronize host and guest clocks with high precision (see results below). chronyc> sources MS Name/IP address Stratum Poll Reach LastRx Last sample =============================================================================== To configure Chronyd to use PHC refclock, add the following line to its configuration file: refclock PHC /dev/ptpX poll 3 dpoll -2 offset 0 Where /dev/ptpX is the kvmclock PTP clock. Signed-off-by: Marcelo Tosatti Acked-by: Richard Cochran Signed-off-by: Paolo Bonzini --- drivers/ptp/Kconfig | 12 +++ drivers/ptp/Makefile | 1 + drivers/ptp/ptp_kvm.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 drivers/ptp/ptp_kvm.c diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index bdce33291161..384f661a6496 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -90,4 +90,16 @@ config PTP_1588_CLOCK_PCH To compile this driver as a module, choose M here: the module will be called ptp_pch. +config PTP_1588_CLOCK_KVM + tristate "KVM virtual PTP clock" + depends on PTP_1588_CLOCK + depends on KVM_GUEST && X86 + default y + help + This driver adds support for using kvm infrastructure as a PTP + clock. This clock is only useful if you are using KVM guests. + + To compile this driver as a module, choose M here: the module + will be called ptp_kvm. + endmenu diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 8b58597298de..530736161a8b 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -6,3 +6,4 @@ ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o +obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c new file mode 100644 index 000000000000..0a54e8326a90 --- /dev/null +++ b/drivers/ptp/ptp_kvm.c @@ -0,0 +1,200 @@ +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct kvm_ptp_clock { + struct ptp_clock *ptp_clock; + struct ptp_clock_info caps; +}; + +DEFINE_SPINLOCK(kvm_ptp_lock); + +static struct pvclock_vsyscall_time_info *hv_clock; + +static struct kvm_clock_pairing clock_pair; +static phys_addr_t clock_pair_gpa; + +static int ptp_kvm_get_time_fn(ktime_t *device_time, + struct system_counterval_t *system_counter, + void *ctx) +{ + unsigned long ret; + struct timespec64 tspec; + unsigned version; + int cpu; + struct pvclock_vcpu_time_info *src; + + spin_lock(&kvm_ptp_lock); + + preempt_disable_notrace(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + + do { + /* + * We are using a TSC value read in the hosts + * kvm_hc_clock_pairing handling. + * So any changes to tsc_to_system_mul + * and tsc_shift or any other pvclock + * data invalidate that measurement. + */ + version = pvclock_read_begin(src); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); + spin_unlock(&kvm_ptp_lock); + preempt_enable_notrace(); + return -EOPNOTSUPP; + } + + tspec.tv_sec = clock_pair.sec; + tspec.tv_nsec = clock_pair.nsec; + ret = __pvclock_read_cycles(src, clock_pair.tsc); + } while (pvclock_read_retry(src, version)); + + preempt_enable_notrace(); + + system_counter->cycles = ret; + system_counter->cs = &kvm_clock; + + *device_time = timespec64_to_ktime(tspec); + + spin_unlock(&kvm_ptp_lock); + + return 0; +} + +static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL, + NULL, xtstamp); +} + +/* + * PTP clock operations + */ + +static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + unsigned long ret; + struct timespec64 tspec; + + spin_lock(&kvm_ptp_lock); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + spin_unlock(&kvm_ptp_lock); + return -EOPNOTSUPP; + } + + tspec.tv_sec = clock_pair.sec; + tspec.tv_nsec = clock_pair.nsec; + spin_unlock(&kvm_ptp_lock); + + memcpy(ts, &tspec, sizeof(struct timespec64)); + + return 0; +} + +static int ptp_kvm_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +static struct ptp_clock_info ptp_kvm_caps = { + .owner = THIS_MODULE, + .name = "KVM virtual PTP", + .max_adj = 0, + .n_ext_ts = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = ptp_kvm_adjfreq, + .adjtime = ptp_kvm_adjtime, + .gettime64 = ptp_kvm_gettime, + .settime64 = ptp_kvm_settime, + .enable = ptp_kvm_enable, + .getcrosststamp = ptp_kvm_getcrosststamp, +}; + +/* module operations */ + +static struct kvm_ptp_clock kvm_ptp_clock; + +static void __exit ptp_kvm_exit(void) +{ + ptp_clock_unregister(kvm_ptp_clock.ptp_clock); +} + +static int __init ptp_kvm_init(void) +{ + clock_pair_gpa = slow_virt_to_phys(&clock_pair); + hv_clock = pvclock_pvti_cpu0_va(); + + if (!hv_clock) + return -ENODEV; + + kvm_ptp_clock.caps = ptp_kvm_caps; + + kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL); + + if (IS_ERR(kvm_ptp_clock.ptp_clock)) + return PTR_ERR(kvm_ptp_clock.ptp_clock); + + return 0; +} + +module_init(ptp_kvm_init); +module_exit(ptp_kvm_exit); + +MODULE_AUTHOR("Marcelo Tosatti "); +MODULE_DESCRIPTION("PTP clock using KVMCLOCK"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 5982f0849e08fe4e4e7df5e345c4539ce9780b1b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 8 Feb 2017 16:20:01 +0000 Subject: KVM: PPC: Book 3S: Fix error return in kvm_vm_ioctl_create_spapr_tce() Fix to return error code -ENOMEM from the memory alloc error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index c379ff5a4438..491c5d8120f7 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -171,6 +171,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } + ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) -- cgit v1.2.3-70-g09d2 From 8ef81a9a453f9048c1683e40b540a4221986a2d1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 9 Feb 2017 16:10:42 +0100 Subject: KVM: x86: hide KVM_HC_CLOCK_PAIRING on 32 bit The newly added hypercall doesn't work on x86-32: arch/x86/kvm/x86.c: In function 'kvm_pv_clock_pairing': arch/x86/kvm/x86.c:6163:6: error: implicit declaration of function 'kvm_get_walltime_and_clockread';did you mean 'kvm_get_time_scale'? [-Werror=implicit-function-declaration] This adds an #ifdef around it, matching the one around the related functions that are also only implemented on 64-bit systems. Fixes: 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall") Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 500008f800dc..2f64e5d0ae53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6148,6 +6148,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +#ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) { @@ -6174,6 +6175,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, return ret; } +#endif /* * kvm_pv_kick_cpu_op: Kick a vcpu. @@ -6239,9 +6241,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; +#ifdef CONFIG_X86_64 case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; +#endif default: ret = -KVM_ENOSYS; break; -- cgit v1.2.3-70-g09d2 From db1c056cee59f4a7670c3bd9ee1657468cf0b4c4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Dec 2016 15:31:41 +0100 Subject: kvm: vmx: Use the hardware provided GPA instead of page walk As in the SVM patch, the guest physical address is passed by VMX to x86_emulate_instruction already, so mark the GPA as available in vcpu->arch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c3e42623090..0828b02b5af2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6382,6 +6382,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0; + vcpu->arch.gpa_available = true; vcpu->arch.exit_qualification = exit_qualification; return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -6399,6 +6400,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) } ret = handle_mmio_page_fault(vcpu, gpa, true); + vcpu->arch.gpa_available = true; if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -8517,6 +8519,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + vcpu->arch.gpa_available = false; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more -- cgit v1.2.3-70-g09d2 From 967235d320329e4a7a2bd1a36b04293063e985ae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 14:03:45 +0100 Subject: KVM: vmx: clear pending interrupts on KVM_SET_LAPIC Pending interrupts might be in the PI descriptor when the LAPIC is restored from an external state; we do not want them to be injected. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/vmx.c | 9 +++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 33b799fd3a6e..8ddd0ed03880 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2204,8 +2204,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - if (kvm_x86_ops->apicv_post_state_restore) - kvm_x86_ops->apicv_post_state_restore(vcpu); + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0828b02b5af2..8d2e0cc8e83e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8749,6 +8749,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -11574,6 +11582,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, -- cgit v1.2.3-70-g09d2 From 0ad3bed6c5ec6dbb093a26802c85088a85fb9757 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 15:23:54 +0100 Subject: kvm: nVMX: move nested events check to kvm_vcpu_running vcpu_run calls kvm_vcpu_running, not kvm_arch_vcpu_runnable, and the former does not call check_nested_events. Once KVM_REQ_EVENT is removed from the APICv interrupt injection path, however, this would leave no place to trigger a vmexit from L2 to L1, causing a missed interrupt delivery while in guest mode. This is caught by the "ack interrupt on exit" test in vmx.flat. [This does not change the calls to check_nested_events in inject_pending_event. That is material for a separate cleanup.] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f64e5d0ae53..204793f0f0e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7023,6 +7023,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } @@ -8389,9 +8392,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } -- cgit v1.2.3-70-g09d2 From 810e6defcca4d05275aa15c2872c0a4949178fcb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:05:46 +0100 Subject: KVM: x86: preparatory changes for APICv cleanups Add return value to __kvm_apic_update_irr/kvm_apic_update_irr. Move vmx_sync_pir_to_irr around. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 25 +++++++++++++++++-------- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/vmx.c | 32 ++++++++++++++++---------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8ddd0ed03880..120afc2bcfd3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -341,7 +341,7 @@ static int find_highest_vector(void *bitmap) vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + REG_POS(vec); if (*reg) - return fls(*reg) - 1 + vec; + return __fls(*reg) + vec; } return -1; @@ -361,27 +361,36 @@ static u8 count_vectors(void *bitmap) return count; } -void __kvm_apic_update_irr(u32 *pir, void *regs) +int __kvm_apic_update_irr(u32 *pir, void *regs) { - u32 i, pir_val; + u32 i, vec; + u32 pir_val, irr_val; + int max_irr = -1; - for (i = 0; i <= 7; i++) { + for (i = vec = 0; i <= 7; i++, vec += 32) { pir_val = READ_ONCE(pir[i]); + irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); if (pir_val) { - pir_val = xchg(&pir[i], 0); - *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + irr_val |= xchg(&pir[i], 0); + *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; } + if (irr_val) + max_irr = __fls(irr_val) + vec; } + + return max_irr; } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { struct kvm_lapic *apic = vcpu->arch.apic; + int max_irr; - __kvm_apic_update_irr(pir, apic->regs); + max_irr = __kvm_apic_update_irr(pir, apic->regs); kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 05abd837b78a..bcbe811f3b97 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -71,8 +71,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -void __kvm_apic_update_irr(u32 *pir, void *regs); -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +int __kvm_apic_update_irr(u32 *pir, void *regs); +int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8d2e0cc8e83e..4ac9b484e244 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5057,22 +5057,6 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) kvm_vcpu_kick(vcpu); } -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!pi_test_on(&vmx->pi_desc)) - return; - - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -8738,6 +8722,22 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!pi_test_on(&vmx->pi_desc)) + return; + + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) -- cgit v1.2.3-70-g09d2 From 3d92789f69162ee5689f3766e5f50bb46b7e1d97 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:29:03 +0100 Subject: KVM: vmx: move sync_pir_to_irr from apic_find_highest_irr to callers Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 120afc2bcfd3..8af6db9b64aa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -410,8 +410,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -581,7 +579,10 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { - int highest_irr = apic_find_highest_irr(apic); + int highest_irr; + if (apic->vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 204793f0f0e2..8f80da161e80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6822,9 +6822,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * Update architecture specific hints for APIC * virtual interrupt delivery. */ - if (vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, kvm_lapic_find_highest_irr(vcpu)); + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit v1.2.3-70-g09d2 From 76dfafd536730ef9b9d99b1cf596916d52be76d1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 17:17:11 +0100 Subject: KVM: x86: do not scan IRR twice on APICv vmentry Calls to apic_find_highest_irr are scanning IRR twice, once in vmx_sync_pir_from_irr and once in apic_search_irr. Change sync_pir_from_irr to get the new maximum IRR from kvm_apic_update_irr; now that it does the computation, it can also do the RVI write. In order to avoid complications in svm.c, make the callback optional. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c | 8 +++++--- arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx.c | 31 +++++++++++++++++++------------ arch/x86/kvm/x86.c | 9 +++------ 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 417502cf42b6..e4f13e714bcf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -969,7 +969,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); + int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8af6db9b64aa..7ed2400b2777 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -515,6 +515,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) */ return apic_find_highest_irr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, @@ -580,9 +581,10 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (apic->vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); - highest_irr = apic_find_highest_irr(apic); + if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active) + highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + else + highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d0414f054bdf..13cd06220b19 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4359,11 +4359,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - return; -} - static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) { kvm_lapic_set_irr(vec, vcpu->arch.apic); @@ -5373,7 +5368,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, - .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, .apicv_post_state_restore = avic_post_state_restore, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ac9b484e244..d03cb62b70d2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6649,8 +6649,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apicv()) + if (!cpu_has_vmx_apicv()) { enable_apicv = 0; + kvm_x86_ops->sync_pir_to_irr = NULL; + } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -8722,20 +8724,25 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; - if (!pi_test_on(&vmx->pi_desc)) - return; - - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + WARN_ON(!vcpu->arch.apicv_active); + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); + } + vmx_hwapic_irr_update(vcpu, max_irr); + return max_irr; } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f80da161e80..75b0f30d75ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2909,7 +2909,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -6659,7 +6659,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -6822,11 +6822,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * Update architecture specific hints for APIC * virtual interrupt delivery. */ - if (vcpu->arch.apicv_active) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); - } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit v1.2.3-70-g09d2 From b95234c840045b7c72380fd14c59416af28fcb02 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 13:57:33 +0100 Subject: kvm: x86: do not use KVM_REQ_EVENT for APICv interrupt injection Since bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked", 2015-09-18) the posted interrupt descriptor is checked unconditionally for PIR.ON. Therefore we don't need KVM_REQ_EVENT to trigger the scan and, if NMIs or SMIs are not involved, we can avoid the complicated event injection path. Calling kvm_vcpu_kick if PIR.ON=1 is also useless, though it has been there since APICv was introduced. However, without the KVM_REQ_EVENT safety net KVM needs to be much more careful about races between vmx_deliver_posted_interrupt and vcpu_enter_guest. First, the IPI for posted interrupts may be issued between setting vcpu->mode = IN_GUEST_MODE and disabling interrupts. If that happens, kvm_trigger_posted_interrupt returns true, but smp_kvm_posted_intr_ipi doesn't do anything about it. The guest is entered with PIR.ON, but the posted interrupt IPI has not been sent and the interrupt is only delivered to the guest on the next vmentry (if any). To fix this, disable interrupts before setting vcpu->mode. This ensures that the IPI is delayed until the guest enters non-root mode; it is then trapped by the processor causing the interrupt to be injected. Second, the IPI may be issued between kvm_x86_ops->sync_pir_to_irr(vcpu) and vcpu->mode = IN_GUEST_MODE. In this case, kvm_vcpu_kick is called but it (correctly) doesn't do anything because it sees vcpu->mode == OUTSIDE_GUEST_MODE. Again, the guest is entered with PIR.ON but no posted interrupt IPI is pending; this time, the fix for this is to move the RVI update after IN_GUEST_MODE. Both issues were mostly masked by the liberal usage of KVM_REQ_EVENT, though the second could actually happen with VT-d posted interrupts. In both race scenarios KVM_REQ_EVENT would cancel guest entry, resulting in another vmentry which would inject the interrupt. This saves about 300 cycles on the self_ipi_* tests of vmexit.flat. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 ++++------- arch/x86/kvm/vmx.c | 8 +++++--- arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++------------------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7ed2400b2777..9fa5b8164961 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -385,12 +385,8 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) { struct kvm_lapic *apic = vcpu->arch.apic; - int max_irr; - max_irr = __kvm_apic_update_irr(pir, apic->regs); - - kvm_make_request(KVM_REQ_EVENT, vcpu); - return max_irr; + return __kvm_apic_update_irr(pir, apic->regs); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -423,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; if (unlikely(vcpu->arch.apicv_active)) { - /* try to update RVI */ + /* need to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_x86_ops->hwapic_irr_update(vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d03cb62b70d2..fd8cd50e9dc6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5051,9 +5051,11 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; - r = pi_test_and_set_on(&vmx->pi_desc); - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(&vmx->pi_desc)) + return; + + if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75b0f30d75ee..63a89a51dcc9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6813,19 +6813,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); } - /* - * KVM_REQ_EVENT is not set when posted interrupts are set by - * VT-d hardware, so we have to update RVI unconditionally. - */ - if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); @@ -6870,20 +6857,39 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + + /* + * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt + * IPI are then delayed after guest entry, which ensures that they + * result in virtual interrupt delivery. + */ + local_irq_disable(); vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); /* - * We should set ->mode before check ->requests, - * Please see the comment in kvm_make_all_cpus_request. - * This also orders the write to mode from any reads - * to the page tables done while the VCPU is running. - * Please see the comment in kvm_flush_remote_tlbs. + * 1) We should set ->mode before checking ->requests. Please see + * the comment in kvm_make_all_cpus_request. + * + * 2) For APICv, we should set ->mode before checking PIR.ON. This + * pairs with the memory barrier implicit in pi_test_and_set_on + * (see vmx_deliver_posted_interrupt). + * + * 3) This also orders the write to mode from any reads to the page + * tables done while the VCPU is running. Please see the comment + * in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); - local_irq_disable(); + /* + * This handles the case where a posted interrupt was + * notified with kvm_vcpu_kick. + */ + if (kvm_lapic_enabled(vcpu)) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + } if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { -- cgit v1.2.3-70-g09d2 From cf8b84f48a5936f558a7a415f1d2f42161bf73eb Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:42 -0800 Subject: kvm: nVMX: Prepare for checkpointing L2 state Split prepare_vmcs12 into two parts: the part that stores the current L2 guest state and the part that sets up the exit information fields. The former will be used when checkpointing the vCPU's VMX state. Modify prepare_vmcs02 so that it can construct a vmcs02 midway through L2 execution, using the checkpointed L2 guest state saved into the cached vmcs12 above. Signed-off-by: Jim Mattson [Rebasing: add from_vmentry argument to prepare_vmcs02 instead of using vmx->nested.nested_run_pending, because it is no longer 1 at the point prepare_vmcs02 is called. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 77 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fd8cd50e9dc6..4b4b59b34d44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10040,7 +10040,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - unsigned long *entry_failure_code) + bool from_vmentry, unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; @@ -10083,21 +10083,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); + if (from_vmentry) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, @@ -10290,16 +10295,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; - } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - + } set_cr4_guest_host_mask(vmx); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + if (from_vmentry && + vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) @@ -10351,7 +10358,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + if (from_vmentry && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); @@ -10561,7 +10569,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qualification)) { leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); nested_vmx_entry_failure(vcpu, vmcs12, @@ -10733,21 +10741,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) } /* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) +static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -10853,6 +10853,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update guest state fields: */ + sync_vmcs12(vcpu, vmcs12); /* update exit information fields: */ -- cgit v1.2.3-70-g09d2 From e29acc55bfc3afcfdf9ea3468f1217d08f0b2d2b Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:43 -0800 Subject: kvm: nVMX: Refactor handle_vmon() Handle_vmon is split into two parts: the part that handles the VMXON instruction, and the part that modifies the vcpu state to transition from legacy mode to VMX operation. The latter will be used when restoring the checkpointed state of a vCPU that was in VMX operation when a snapshot was taken. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4b4b59b34d44..33cb8d2b9653 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7124,6 +7124,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, return 0; } +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; + + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) + goto out_shadow_vmcs; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ + vmcs_clear(shadow_vmcs); + vmx->vmcs01.shadow_vmcs = shadow_vmcs; + } + + INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); + vmx->nested.vmcs02_num = 0; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vmxon = true; + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -7134,9 +7181,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ static int handle_vmon(struct kvm_vcpu *vcpu) { + int ret; struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -7176,49 +7223,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) return 1; - - if (cpu_has_vmx_msr_bitmap()) { - vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx->nested.msr_bitmap) - goto out_msr_bitmap; - } - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; - - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } - - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - - vmx->nested.vmxon = true; + + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); - -out_shadow_vmcs: - kfree(vmx->nested.cached_vmcs12); - -out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); - -out_msr_bitmap: - return -ENOMEM; } /* -- cgit v1.2.3-70-g09d2 From a8bc284eb70f58a4fd0f4a70d816cca28ca01973 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:44 -0800 Subject: kvm: nVMX: Refactor handle_vmptrld() Handle_vmptrld is split into two parts: the part that handles the VMPTRLD instruction, and the part that establishes the current VMCS pointer. The latter will be used when restoring the checkpointed state of a vCPU that had a valid VMCS pointer when a snapshot was taken. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 33cb8d2b9653..ef9affcabdeb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7678,6 +7678,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.sync_shadow_vmcs = true; + } +} + /* Emulate the VMPTRLD instruction */ static int handle_vmptrld(struct kvm_vcpu *vcpu) { @@ -7708,7 +7720,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; /* @@ -7717,14 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) */ memcpy(vmx->nested.cached_vmcs12, vmx->nested.current_vmcs12, VMCS12_SIZE); - - if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.sync_shadow_vmcs = true; - } + set_current_vmptr(vmx, vmptr); } nested_vmx_succeed(vcpu); -- cgit v1.2.3-70-g09d2 From 6beb7bd52e482c213dfa6c1c88e27a579df5bf4d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:45 -0800 Subject: kvm: nVMX: Refactor nested_get_vmcs12_pages() Perform the checks on vmcs12 state early, but defer the gpa->hpa lookups until after prepare_vmcs02. Later, when we restore the checkpointed state of a vCPU in guest mode, we will not be able to do the gpa->hpa lookups when the restore is done. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 138 ++++++++++++++++++++++++++--------------------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef9affcabdeb..650f34336fad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9626,17 +9626,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 hpa; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || - vmcs12->apic_access_addr >> maxphyaddr) - return false; - /* * Translate L1 physical address to host physical * address for vmcs02. Keep the page pinned, so this @@ -9647,59 +9646,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, nested_release_page(vmx->nested.apic_access_page); vmx->nested.apic_access_page = nested_get_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (vmx->nested.apic_access_page) { + hpa = page_to_phys(vmx->nested.apic_access_page); + vmcs_write64(APIC_ACCESS_ADDR, hpa); + } else { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + } + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + kvm_vcpu_reload_apic_access_page(vcpu); } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || - vmcs12->virtual_apic_page_addr >> maxphyaddr) - return false; - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); /* - * Failing the vm entry is _not_ what the processor does - * but it's basically the only possibility we have. - * We could still enter the guest if CR8 load exits are - * enabled, CR8 store exits are enabled, and virtualize APIC - * access is disabled; in this case the processor would never - * use the TPR shadow and we could simply clear the bit from - * the execution control. But such a configuration is useless, - * so let's keep the code simple. + * If translation failed, VM entry will fail because + * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. + * Failing the vm entry is _not_ what the processor + * does but it's basically the only possibility we + * have. We could still enter the guest if CR8 load + * exits are enabled, CR8 store exits are enabled, and + * virtualize APIC access is disabled; in this case + * the processor would never use the TPR shadow and we + * could simply clear the bit from the execution + * control. But such a configuration is useless, so + * let's keep the code simple. */ - if (!vmx->nested.virtual_apic_page) - return false; + if (vmx->nested.virtual_apic_page) { + hpa = page_to_phys(vmx->nested.virtual_apic_page); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } } if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || - vmcs12->posted_intr_desc_addr >> maxphyaddr) - return false; - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ kunmap(vmx->nested.pi_desc_page); nested_release_page(vmx->nested.pi_desc_page); } vmx->nested.pi_desc_page = nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - if (!vmx->nested.pi_desc_page) - return false; - vmx->nested.pi_desc = (struct pi_desc *)kmap(vmx->nested.pi_desc_page); if (!vmx->nested.pi_desc) { nested_release_page_clean(vmx->nested.pi_desc_page); - return false; + return; } vmx->nested.pi_desc = (struct pi_desc *)((void *)vmx->nested.pi_desc + (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); } - - return true; + if (cpu_has_vmx_msr_bitmap() && + nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; + else + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); } static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) @@ -10146,12 +10166,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } else + } else { exec_control &= ~PIN_BASED_POSTED_INTR; + } vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); @@ -10196,26 +10213,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!vmx->nested.apic_access_page) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); - } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - exec_control |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - kvm_vcpu_reload_apic_access_page(vcpu); - } - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -10230,6 +10227,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; + + /* + * Write an illegal value to APIC_ACCESS_ADDR. Later, + * nested_get_vmcs12_pages will either fix it up or + * remove the VM execution control. + */ + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + vmcs_write64(APIC_ACCESS_ADDR, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -10266,19 +10272,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } - if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; /* MSR_BITMAP will be set by following vmx_set_efer. */ - else - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; - /* * Merging of IO bitmap not currently supported. * Rather, exit every time. @@ -10456,11 +10459,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) goto out; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); goto out; @@ -10592,6 +10590,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + nested_get_vmcs12_pages(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); -- cgit v1.2.3-70-g09d2 From ca0bde28f2ed66c2229ecfb7f4bfa0defa3da4b5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:46 -0800 Subject: kvm: nVMX: Split VMCS checks from nested_vmx_run() The checks performed on the contents of the vmcs12 are extracted from nested_vmx_run so that they can be used to validate a vmcs12 that has been restored from a checkpoint. Signed-off-by: Jim Mattson [Change prepare_vmcs02 and nested_vmx_load_cr3's last argument to u32, to match check_vmentry_postreqs. Update comments for singlestep handling. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 194 ++++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 91 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 650f34336fad..71df7411959f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10035,7 +10035,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) * is assigned to entry_failure_code on failure. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - unsigned long *entry_failure_code) + u32 *entry_failure_code) { if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { @@ -10075,7 +10075,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - bool from_vmentry, unsigned long *entry_failure_code) + bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; @@ -10411,68 +10411,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. - */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - bool ia32e; - u32 msr_entry_idx; - unsigned long exit_qualification; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (!nested_vmx_check_vmcs12(vcpu)) - goto out; - - vmcs12 = get_vmcs12(vcpu); - - if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); - - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (vmcs12->launch_state == launch) { - nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - goto out; - } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, @@ -10489,28 +10443,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) - { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - goto out; - } + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { - nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - goto out; - } + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + return 0; +} + +static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - } - if (vmcs12->vmcs_link_pointer != -1ull) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); + + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + vmcs12->vmcs_link_pointer != -1ull) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -10523,16 +10479,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. */ - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) return 1; - } } /* @@ -10546,11 +10500,75 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) return 1; - } + } + + return 0; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + struct loaded_vmcs *vmcs02; + u32 msr_entry_idx; + u32 exit_qual; + int ret; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + + vmcs12 = get_vmcs12(vcpu); + + if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (vmcs12->launch_state == launch) { + nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + goto out; + } + + ret = check_vmentry_prereqs(vcpu, vmcs12); + if (ret) { + nested_vmx_failValid(vcpu, ret); + goto out; + } + + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions; don't call kvm_skip_emulated_instruction. + * This is not 100% correct; for performance reasons, we delegate most + * of the checks on host state to the processor. If those fail, + * the singlestep trap is missed. + */ + skip_emulated_instruction(vcpu); + + ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual); + if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; } /* @@ -10562,12 +10580,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; - /* - * After this point, the trap flag no longer triggers a singlestep trap - * on the vm entry instructions. Don't call - * kvm_skip_emulated_instruction. - */ - skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -10582,11 +10594,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qualification)) { + if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qual)) { leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qualification); + EXIT_REASON_INVALID_STATE, exit_qual); return 1; } @@ -10937,7 +10949,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; - unsigned long entry_failure_code; + u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; -- cgit v1.2.3-70-g09d2 From 858e25c06fb0bc4b39b2f01c14c990348e3a9b67 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 30 Nov 2016 12:03:47 -0800 Subject: kvm: nVMX: Refactor nested_vmx_run() Nested_vmx_run is split into two parts: the part that handles the VMLAUNCH/VMRESUME instruction, and the part that modifies the vcpu state to transition from VMX root mode to VMX non-root mode. The latter will be used when restoring the checkpointed state of a vCPU that was in VMX operation when a snapshot was taken. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 111 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71df7411959f..bca60665d55d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10507,6 +10507,65 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct loaded_vmcs *vmcs02; + int cpu; + u32 msr_entry_idx; + u32 exit_qual; + + vmcs02 = nested_get_current_vmcs02(vmx); + if (!vmcs02) + return -ENOMEM; + + enter_guest_mode(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + + cpu = get_cpu(); + vmx->loaded_vmcs = vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); + + vmx_segment_cache_clear(vmx); + + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qual); + return 1; + } + + nested_get_vmcs12_pages(vcpu, vmcs12); + + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return 0; +} + /* * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 * for running an L2 nested guest. @@ -10515,9 +10574,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - u32 msr_entry_idx; u32 exit_qual; int ret; @@ -10576,58 +10632,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - - enter_guest_mode(vcpu); - - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - - vmx_segment_cache_clear(vmx); - - if (prepare_vmcs02(vcpu, vmcs12, true, &exit_qual)) { - leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } - - nested_get_vmcs12_pages(vcpu, vmcs12); - - msr_entry_idx = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) { - leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); - return 1; - } - - vmcs12->launch_state = 1; + ret = enter_vmx_non_root_mode(vcpu, true); + if (ret) + return ret; if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; - /* - * Note no nested_vmx_succeed or nested_vmx_fail here. At this point - * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet - * returned as far as L1 is concerned. It will only return (and set - * the success flag) when L2 exits (see nested_vmx_vmexit()). - */ return 1; out: -- cgit v1.2.3-70-g09d2 From 681bcea802f2bf0a28cdf0fd0f813de7bb5cd3c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Jan 2017 22:21:16 +0100 Subject: KVM: svm: inititalize hash table structures directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hashtable and guarding spinlock are global data structures, we can inititalize them statically. Signed-off-by: David Hildenbrand Message-Id: <20170124212116.4568-1-david@redhat.com> Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 13cd06220b19..4e5905a1ce70 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) * a particular vCPU. */ #define SVM_VM_DATA_HASH_BITS 8 -DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static spinlock_t svm_vm_data_hash_lock; +static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static DEFINE_SPINLOCK(svm_vm_data_hash_lock); /* Note: * This function is called from IOMMU driver to notify @@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void) } else { pr_info("AVIC enabled\n"); - hash_init(svm_vm_data_hash); - spin_lock_init(&svm_vm_data_hash_lock); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } } -- cgit v1.2.3-70-g09d2 From 5b73d6347eb82cd2a26698fc339607e25e0ad917 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 15 Feb 2017 14:40:04 +1100 Subject: KVM: PPC: Book3S HV: Prevent double-free on HPT resize commit path resize_hpt_release(), called once the HPT resize of a KVM guest is completed (successfully or unsuccessfully) frees the state structure for the resize. It is currently not safe to call with a NULL pointer. However, one of the error paths in kvm_vm_ioctl_resize_hpt_commit() can invoke it with a NULL pointer. This will occur if userspace improperly invokes KVM_PPC_RESIZE_HPT_COMMIT without previously calling KVM_PPC_RESIZE_HPT_PREPARE, or if it calls COMMIT twice without an intervening PREPARE. To fix this potential crash bug - and maybe others like it, make it safe (and a no-op) to call resize_hpt_release() with a NULL resize pointer. Found by Dan Carpenter with a static checker. Reported-by: Dan Carpenter Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 013552f05182..72ccac26e464 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1407,6 +1407,9 @@ static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { BUG_ON(kvm->arch.resize_hpt != resize); + if (!resize) + return; + if (resize->hpt.virt) kvmppc_free_hpt(&resize->hpt); -- cgit v1.2.3-70-g09d2 From 4306f200c71b959c44c32c8a0048fd5857a357c5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 15 Feb 2017 20:27:20 +0100 Subject: ptp_kvm: try to detect hypercall availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No point in registering the device if it cannot work. The hypercall does not advertise itself, so we have to call it. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- drivers/ptp/ptp_kvm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c index 0a54e8326a90..09b4df74291e 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm.c @@ -176,12 +176,19 @@ static void __exit ptp_kvm_exit(void) static int __init ptp_kvm_init(void) { + long ret; + clock_pair_gpa = slow_virt_to_phys(&clock_pair); hv_clock = pvclock_pvti_cpu0_va(); if (!hv_clock) return -ENODEV; + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) + return -ENODEV; + kvm_ptp_clock.caps = ptp_kvm_caps; kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL); -- cgit v1.2.3-70-g09d2 From 47c0152e0f8bd325869417e0aaff032e62bcf6f2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Dec 2016 11:44:07 +0100 Subject: KVM: VMX: use vmcs_set/clear_bits for CPU-based execution controls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bca60665d55d..0e0b5d09597e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5459,26 +5459,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis() || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -6137,12 +6131,8 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -6408,12 +6398,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - - /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.2.3-70-g09d2 From 5a2d4365d2c39e8a3ff3cbee3af123a23aebdb9c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Feb 2017 20:32:28 -0800 Subject: KVM: only retrieve memslots once when initializing cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will make it a bit simpler to handle multiple address spaces in gfn_to_hva_cache. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 482612b4e496..e21bac7ed5d3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1937,10 +1937,10 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); -int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len) +static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, + struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); int offset = offset_in_page(gpa); gfn_t start_gfn = gpa >> PAGE_SHIFT; gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; @@ -1950,7 +1950,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, ghc->gpa = gpa; ghc->generation = slots->generation; ghc->len = len; - ghc->memslot = gfn_to_memslot(kvm, start_gfn); + ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { ghc->hva += offset; @@ -1960,7 +1960,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, * verify that the entire region is valid here. */ while (start_gfn <= end_gfn) { - ghc->memslot = gfn_to_memslot(kvm, start_gfn); + ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) @@ -1972,6 +1972,13 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } return 0; } + +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); +} EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, @@ -1984,7 +1991,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, BUG_ON(len + offset > ghc->len); if (slots->generation != ghc->generation) - kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) return kvm_write_guest(kvm, gpa, data, len); @@ -2017,7 +2024,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, BUG_ON(len > ghc->len); if (slots->generation != ghc->generation) - kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); + __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) return kvm_read_guest(kvm, ghc->gpa, data, len); -- cgit v1.2.3-70-g09d2 From 4bd518f1598d41985eb933aa73bac2605b70984c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Feb 2017 20:44:51 -0800 Subject: KVM: use separate generations for each address space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will make it easier to support multiple address spaces in kvm_gfn_to_hva_cache_init. Instead of having to check the address space id, we can keep on checking just the generation number. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e21bac7ed5d3..a83c186cefc1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -506,11 +506,6 @@ static struct kvm_memslots *kvm_alloc_memslots(void) if (!slots) return NULL; - /* - * Init kvm generation close to the maximum to easily test the - * code of handling generation number wrap-around. - */ - slots->generation = -150; for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) slots->id_to_index[i] = slots->memslots[i].id = i; @@ -641,9 +636,16 @@ static struct kvm *kvm_create_vm(unsigned long type) r = -ENOMEM; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - kvm->memslots[i] = kvm_alloc_memslots(); - if (!kvm->memslots[i]) + struct kvm_memslots *slots = kvm_alloc_memslots(); + if (!slots) goto out_err_no_srcu; + /* + * Generations must be different for each address space. + * Init kvm generation close to the maximum to easily test the + * code of handling generation number wrap-around. + */ + slots->generation = i * 2 - 150; + rcu_assign_pointer(kvm->memslots[i], slots); } if (init_srcu_struct(&kvm->srcu)) @@ -870,8 +872,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, * Increment the new memslot generation a second time. This prevents * vm exits that race with memslot updates from caching a memslot * generation that will (potentially) be valid forever. + * + * Generations must be unique even across address spaces. We do not need + * a global counter for that, instead the generation space is evenly split + * across address spaces. For example, with two address spaces, address + * space 0 will use generations 0, 4, 8, ... while * address space 1 will + * use generations 2, 6, 10, 14, ... */ - slots->generation++; + slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; kvm_arch_memslots_updated(kvm, slots); -- cgit v1.2.3-70-g09d2 From bbd6411513aa8ef3ea02abab61318daf87c1af1e Mon Sep 17 00:00:00 2001 From: "Cao, Lei" Date: Fri, 3 Feb 2017 20:04:35 +0000 Subject: KVM: Support vCPU-based gfn->hva cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide versions of struct gfn_to_hva_cache functions that take vcpu as a parameter instead of struct kvm. The existing functions are not needed anymore, so delete them. This allows dirty pages to be logged in the vcpu dirty ring, instead of the global dirty ring, for ring-based dirty memory tracking. Signed-off-by: Lei Cao Message-Id: Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 ++++++++++------------ arch/x86/kvm/x86.c | 41 ++++++++++++++++++++--------------------- include/linux/kvm_host.h | 16 ++++++++-------- virt/kvm/kvm_main.c | 34 +++++++++++++++++----------------- 4 files changed, 55 insertions(+), 58 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9fa5b8164961..bad6a25067bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2287,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2340,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2441,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a89a51dcc9..0aa8db229e0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2875,7 +2875,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -8533,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cda457bcedc1..2db458ee94b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -641,18 +641,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a83c186cefc1..263a80513ad9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1981,18 +1981,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, return 0; } -int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, +int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init); -int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) +int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); int r; gpa_t gpa = ghc->gpa + offset; @@ -2002,7 +2002,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_write_guest(kvm, gpa, data, len); + return kvm_vcpu_write_guest(vcpu, gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2014,19 +2014,19 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); +EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached); -int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); + return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); +EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached); -int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); int r; BUG_ON(len > ghc->len); @@ -2035,7 +2035,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_read_guest(kvm, ghc->gpa, data, len); + return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2046,7 +2046,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { -- cgit v1.2.3-70-g09d2 From 4da934dc6515afaa1db4e02548803de0cd279734 Mon Sep 17 00:00:00 2001 From: Vipin K Parashar Date: Thu, 16 Feb 2017 22:40:26 +0530 Subject: KVM: PPC: Book3S PR: Ratelimit copy data failure error messages kvm_ppc_mmu_book3s_32/64 xlat() logs "KVM can't copy data" error upon failing to copy user data to kernel space. This floods kernel log once such fails occur in short time period. Ratelimit this error to avoid flooding kernel logs upon copy data failures. Signed-off-by: Vipin K Parashar Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_32_mmu.c | 3 ++- arch/powerpc/kvm/book3s_64_mmu.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index a2eb6d354a57..1992676c7a94 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -224,7 +224,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary); if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { - printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp); + printk_ratelimited(KERN_ERR + "KVM: Can't copy data from 0x%lx!\n", ptegp); goto no_page_found; } diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index b9131aa1aedf..70153578131a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -265,7 +265,8 @@ do_second: goto no_page_found; if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { - printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); + printk_ratelimited(KERN_ERR + "KVM: Can't copy data from 0x%lx!\n", ptegp); goto no_page_found; } -- cgit v1.2.3-70-g09d2 From 3a4f17608162bbbdcdd9680789b1bc61017cefa3 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 16 Feb 2017 22:07:12 +0100 Subject: KVM: PPC: Book3S HV: Turn "KVM guest htab" message into a debug message The average user likely does not know what a "htab" or "LPID" is, and it's annoying that these messages are quickly filling the dmesg log when you're doing boot cycle tests, so let's turn it into a debug message instead. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 72ccac26e464..b68b342dd01f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -127,8 +127,8 @@ void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) kvm->arch.hpt = *info; kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); - pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", - info->virt, (long)info->order, kvm->arch.lpid); + pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n", + info->virt, (long)info->order, kvm->arch.lpid); } long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) -- cgit v1.2.3-70-g09d2 From 460df4c1fc7c00829050c08d6368dc6e6beef307 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 11:50:15 +0100 Subject: KVM: race-free exit from KVM_RUN without POSIX signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick" a VCPU out of KVM_RUN through a POSIX signal. A signal is attached to a dummy signal handler; by blocking the signal outside KVM_RUN and unblocking it inside, this possible race is closed: VCPU thread service thread -------------------------------------------------------------- check flag set flag raise signal (signal handler does nothing) KVM_RUN However, one issue with KVM_SET_SIGNAL_MASK is that it has to take tsk->sighand->siglock on every KVM_RUN. This lock is often on a remote NUMA node, because it is on the node of a thread's creator. Taking this lock can be very expensive if there are many userspace exits (as is the case for SMP Windows VMs without Hyper-V reference time counter). As an alternative, we can put the flag directly in kvm_run so that KVM can see it: VCPU thread service thread -------------------------------------------------------------- raise signal signal handler set run->immediate_exit KVM_RUN check run->immediate_exit Reviewed-by: Radim Krčmář Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 13 ++++++++++++- arch/arm/kvm/arm.c | 4 ++++ arch/mips/kvm/mips.c | 7 ++++++- arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 6 +++++- include/uapi/linux/kvm.h | 4 +++- 7 files changed, 39 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e4f2cdcf78eb..069450938b79 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3389,7 +3389,18 @@ struct kvm_run { Request that KVM_RUN return when it becomes possible to inject external interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - __u8 padding1[7]; + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. + + __u8 padding1[6]; /* out */ __u32 exit_reason; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 21c493a9e5c9..c9a2103faeb9 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PSCI_0_2: case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } + if (run->immediate_exit) + return -EINTR; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 31ee5ee0010b..ed81e5ac1426 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { - int r = 0; + int r = -EINTR; sigset_t sigsaved; if (vcpu->sigset_active) @@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; } + if (run->immediate_exit) + goto out; + lose_fpu(1); local_irq_disable(); @@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_exit_irqoff(); local_irq_enable(); +out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2b3e4e620078..1fe1391ba2c2 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; case KVM_CAP_PPC_PAIRED_SINGLES: @@ -1117,7 +1118,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } - r = kvmppc_vcpu_run(run, vcpu); + if (run->immediate_exit) + r = -EINTR; + else + r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 502de74ea984..99e35fe0dea8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_S390_INJECT_IRQ: case KVM_CAP_S390_USER_SIGP: case KVM_CAP_S390_USER_STSI: @@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int rc; sigset_t sigsaved; + if (kvm_run->immediate_exit) + return -EINTR; + if (guestdbg_exit_pending(vcpu)) { kvm_s390_prepare_debug_exit(vcpu); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0aa8db229e0a..8d3047c8cce7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: + case KVM_CAP_IMMEDIATE_EXIT: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -7202,7 +7203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = vcpu_run(vcpu); + if (kvm_run->immediate_exit) + r = -EINTR; + else + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7964b970b9ad..f51d5082a377 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -218,7 +218,8 @@ struct kvm_hyperv_exit { struct kvm_run { /* in */ __u8 request_interrupt_window; - __u8 padding1[7]; + __u8 immediate_exit; + __u8 padding1[6]; /* out */ __u32 exit_reason; @@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 +#define KVM_CAP_IMMEDIATE_EXIT 136 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-70-g09d2 From bd7e5b0899a429445cc6e3037c13f8b5ae3be903 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Feb 2017 21:18:52 -0800 Subject: KVM: x86: remove code for lazy FPU handling The FPU is always active now when running KVM. Reviewed-by: David Matlack Reviewed-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 -- arch/x86/kvm/cpuid.c | 2 - arch/x86/kvm/svm.c | 43 ++------------- arch/x86/kvm/vmx.c | 112 ++++++---------------------------------- arch/x86/kvm/x86.c | 7 +-- include/linux/kvm_host.h | 1 - 6 files changed, 19 insertions(+), 149 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4f13e714bcf..74ef58c8ff53 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,7 +55,6 @@ #define KVM_REQ_TRIPLE_FAULT 10 #define KVM_REQ_MMU_SYNC 11 #define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_DEACTIVATE_FPU 13 #define KVM_REQ_EVENT 14 #define KVM_REQ_APF_HALT 15 #define KVM_REQ_STEAL_UPDATE 16 @@ -936,8 +935,6 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); u32 (*get_pkru)(struct kvm_vcpu *vcpu); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c0e2036217ad..1d155cc56629 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - kvm_x86_ops->fpu_activate(vcpu); - /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4e5905a1ce70..d1efe2c62b3f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1157,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.fpu_active = 1; svm->vcpu.arch.hflags = 0; set_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1899,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); mark_dirty(svm->vmcb, VMCB_CR); - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); } else { @@ -1938,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!npt_enabled) cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -2158,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - static bool is_erratum_383(void) { int err, i; @@ -2571,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) if (!npt_enabled && svm->apf_reason == 0) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -4018,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, @@ -5072,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void) return true; } -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -5340,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_pkru = svm_get_pkru, - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0e0b5d09597e..9856b73a21ad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) } } -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). - */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; @@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; @@ -5321,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + set_cr4_guest_host_mask(vmx); if (vmx_xsaves_supported()) @@ -5425,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); @@ -5698,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - if (is_invalid_opcode(intr_info)) { if (is_guest_mode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5892,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as appropriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). - */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -5953,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - handle_clts(vcpu); + WARN_ONCE(1, "Guest should always own CR0.TS"); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - vmx_fpu_activate(vcpu); return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { @@ -10349,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified - * TS bit (for lazy fpu) and bits which we consider mandatory enabled. + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. * The CR0_READ_SHADOW is what L2 should have expected to read given * the specifications by L1; It's not enough to take * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we @@ -10963,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because it depends on the current state of - * fpu_active (which may have changed). - * Note that vmx_set_cr0 refers to efer set above. + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); */ + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; vmx_set_cr0(vcpu, vmcs12->host_cr0); - /* - * If we did fpu_activate()/fpu_deactivate() during L2's run, we need - * to apply the same changes to L1's vmcs. We just set cr0 correctly, - * but we also need to update cr0_guest_host_mask and exception_bitmap. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* - * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); - */ + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); @@ -11609,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_pkru = vmx_get_pkru, - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d3047c8cce7..c48404017e4f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6751,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ vcpu->arch.apf.halted = true; @@ -6856,8 +6852,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + kvm_load_guest_fpu(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2db458ee94b0..8d69d5150748 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -221,7 +221,6 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; struct swait_queue_head wq; struct pid *pid; -- cgit v1.2.3-70-g09d2 From f6a3b168e56abe094b5c93dbafc2f0a0bf64e702 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 22 Jan 2017 11:30:21 +0100 Subject: KVM: Return directly after a failed copy_from_user() in kvm_vm_compat_ioctl() * Return directly after a call of the function "copy_from_user" failed in a case block. This issue was detected by using the Coccinelle software. * Delete the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 263a80513ad9..bd131c539c1e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3148,10 +3148,9 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct compat_kvm_dirty_log compat_log; struct kvm_dirty_log log; - r = -EFAULT; if (copy_from_user(&compat_log, (void __user *)arg, sizeof(compat_log))) - goto out; + return -EFAULT; log.slot = compat_log.slot; log.padding1 = compat_log.padding1; log.padding2 = compat_log.padding2; @@ -3163,8 +3162,6 @@ static long kvm_vm_compat_ioctl(struct file *filp, default: r = kvm_vm_ioctl(filp, ioctl, arg); } - -out: return r; } #endif -- cgit v1.2.3-70-g09d2 From 58d6db34917278614cbb3a329b66498658560a46 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 22 Jan 2017 17:30:16 +0100 Subject: KVM: Return an error code only as a constant in kvm_get_dirty_log_protect() * Return an error code without storing it in an intermediate variable. * Delete the local variable "r" and the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bd131c539c1e..e109d9e663b5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1164,24 +1164,22 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r, i, as_id, id; + int i, as_id, id; unsigned long n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; - r = -EINVAL; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) - goto out; + return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); dirty_bitmap = memslot->dirty_bitmap; - r = -ENOENT; if (!dirty_bitmap) - goto out; + return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); @@ -1210,14 +1208,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, } spin_unlock(&kvm->mmu_lock); - - r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - goto out; - - r = 0; -out: - return r; + return -EFAULT; + return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); #endif -- cgit v1.2.3-70-g09d2 From 843574a3ede1d4d3f3c88317a50d5c76d5a9515c Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 22 Jan 2017 17:41:07 +0100 Subject: KVM: Return an error code only as a constant in kvm_get_dirty_log() * Return an error code without storing it in an intermediate variable. * Delete the local variable "r" and the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e109d9e663b5..cc4d6e0dd2a2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1102,37 +1102,31 @@ int kvm_get_dirty_log(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r, i, as_id, id; + int i, as_id, id; unsigned long n; unsigned long any = 0; - r = -EINVAL; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) - goto out; + return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); - r = -ENOENT; if (!memslot->dirty_bitmap) - goto out; + return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); for (i = 0; !any && i < n/sizeof(long); ++i) any = memslot->dirty_bitmap[i]; - r = -EFAULT; if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) - goto out; + return -EFAULT; if (any) *is_dirty = 1; - - r = 0; -out: - return r; + return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log); -- cgit v1.2.3-70-g09d2 From bcd3bb63dbc87a3bbb21e95a09cd26bb6479c332 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 18 Feb 2017 08:30:44 +1100 Subject: KVM: PPC: Book3S HV: Disable HPT resizing on POWER9 for now The new HPT resizing code added in commit b5baa6877315 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation", 2016-12-20) doesn't have code to handle the new HPTE format which POWER9 uses. Thus it would be best not to advertise it to userspace on POWER9 systems until it works properly. Also, since resize_hpt_rehash_hpte() contains BUG_ON() calls that could be hit on POWER9, let's prevent it from being called on POWER9 for now. Acked-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++++++ arch/powerpc/kvm/powerpc.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b68b342dd01f..f3158fb16de3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1370,6 +1370,12 @@ static int resize_hpt_rehash(struct kvm_resize_hpt *resize) unsigned long i; int rc; + /* + * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs + * that POWER9 uses, and could well hit a BUG_ON on POWER9. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { rc = resize_hpt_rehash_hpte(resize, i); if (rc != 0) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2b3e4e620078..fcb253ba51e5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -613,7 +613,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_SPAPR_RESIZE_HPT: - r = !!hv_enabled; + /* Disable this on POWER9 until code handles new HPTE format */ + r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); break; #endif case KVM_CAP_PPC_HTM: -- cgit v1.2.3-70-g09d2 From 06ce521af9558814b8606c0476c54497cf83a653 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 24 Jan 2017 11:56:21 +0100 Subject: kvm: fix page struct leak in handle_vmon handle_vmon gets a reference on VMXON region page, but does not release it. Release the reference. Found by syzkaller; based on a patch by Dmitry. Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9856b73a21ad..d13073c841ff 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6996,13 +6996,18 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } page = nested_get_page(vcpu, vmptr); - if (page == NULL || - *(u32 *)kmap(page) != VMCS12_REVISION) { + if (page == NULL) { nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } kunmap(page); + nested_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: -- cgit v1.2.3-70-g09d2 From 4f53ab14285802b298261f8b52af322039d1dfd0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:09 -0800 Subject: x86/asm: Define the kernel TSS limit in a macro Rather than open-coding the kernel TSS limit in set_tss_desc(), make it a real macro near the TSS layout definition. This is purely a cleanup. Cc: Thomas Garnier Cc: Jim Mattson Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/desc.h | 10 +--------- arch/x86/include/asm/processor.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 12080d87da3b..2e781bcc5e12 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -177,16 +177,8 @@ static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) struct desc_struct *d = get_cpu_gdt_table(cpu); tss_desc tss; - /* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. See tss_struct definition in processor.h - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte - */ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + - sizeof(unsigned long) - 1); + __KERNEL_TSS_LIMIT); write_gdt_entry(d, entry, &tss, DESC_TSS); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1be64da0384e..f8f1b7537abe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -341,6 +341,16 @@ struct tss_struct { DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +/* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ +#define __KERNEL_TSS_LIMIT \ + (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif -- cgit v1.2.3-70-g09d2 From e0c230634af99967da79a6ed1faecc720fb623ca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:10 -0800 Subject: x86/kvm/vmx: Don't fetch the TSS base from the GDT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current CPU's TSS base is a foregone conclusion, so there's no need to parse it out of the segment tables. This should save a couple cycles (as STR is surely microcoded and poorly optimized) but, more importantly, it's a cleanup and it means that segment_base() will never be called on 64-bit kernels. Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d13073c841ff..3dbbf4ec471f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2086,13 +2086,6 @@ static unsigned long segment_base(u16 selector) return v; } -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2292,10 +2285,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. + * processors. See 22.2.4. */ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + vmcs_writel(HOST_TR_BASE, + (unsigned long)this_cpu_ptr(&cpu_tss)); + vmcs_writel(HOST_GDTR_BASE, gdt->address); rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit v1.2.3-70-g09d2 From e28baeadcf0d657c6b6e849ae1b4faccb4faf326 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:11 -0800 Subject: x86/kvm/vmx: Get rid of segment_base() on 64-bit kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was a bit buggy (it didn't list all segment types that needed 64-bit fixups), but the bug was irrelevant because it wasn't called in any interesting context on 64-bit kernels and was only used for data segents on 32-bit kernels. To avoid confusion, make it explicitly 32-bit only. Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3dbbf4ec471f..3ddd72303fe4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2057,6 +2057,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) } } +#ifdef CONFIG_X86_32 +/* + * On 32-bit kernels, VM exits still load the FS and GS bases from the + * VMCS rather than the segment table. KVM uses this helper to figure + * out the current bases to poke them into the VMCS before entry. + */ static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); @@ -2079,12 +2085,9 @@ static unsigned long segment_base(u16 selector) } d = (struct desc_struct *)(table_base + (selector & ~7)); v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif return v; } +#endif static void vmx_save_host_state(struct kvm_vcpu *vcpu) { -- cgit v1.2.3-70-g09d2 From 8c2e41f7ae1234c192ef497472ad306227c77c03 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:12 -0800 Subject: x86/kvm/vmx: Simplify segment_base() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use actual pointer types for pointers (instead of unsigned long) and replace hardcoded constants with the appropriate self-documenting macros. The function is still a bit messy, but this seems a lot better than before to me. This is mostly borrowed from a patch by Thomas Garnier. Cc: Thomas Garnier Cc: Jim Mattson Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3ddd72303fe4..2dd94cf597cc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2067,24 +2067,23 @@ static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; - unsigned long table_base; + struct desc_struct *table; unsigned long v; - if (!(selector & ~3)) + if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = gdt->address; + table = (struct desc_struct *)gdt->address; - if (selector & 4) { /* from ldt */ + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); - if (!(ldt_selector & ~3)) + if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; - table_base = segment_base(ldt_selector); + table = (struct desc_struct *)segment_base(ldt_selector); } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); + v = get_desc_base(&table[selector >> 3]); return v; } #endif -- cgit v1.2.3-70-g09d2 From d3273deac9c0cdae32eb46f928487433eaa37f87 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:13 -0800 Subject: x86/asm/64: Drop __cacheline_aligned from struct x86_hw_tss Historically, the entire TSS + io bitmap structure was cacheline aligned, but commit ca241c75037b ("x86: unify tss_struct") changed it (presumably inadvertently) so that the fixed-layout hardware part is cacheline-aligned and the io bitmap is after the padding. This wastes 24 bytes (the hardware part should be 104 bytes, but this pads it to 128 bytes) and, serves no purpose, and causes sizeof(struct x86_hw_tss) to have a confusing value. Drop the pointless alignment. Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f8f1b7537abe..1879cdf2b6ae 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -303,7 +303,7 @@ struct x86_hw_tss { u16 reserved5; u16 io_bitmap_base; -} __attribute__((packed)) ____cacheline_aligned; +} __attribute__((packed)); #endif /* -- cgit v1.2.3-70-g09d2 From b7ffc44d5b2ea163899d09289ca7743d5c32e926 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Feb 2017 08:56:14 -0800 Subject: x86/kvm/vmx: Defer TR reload after VM exit Intel's VMX is daft and resets the hidden TSS limit register to 0x67 on VMX reload, and the 0x67 is not configurable. KVM currently reloads TR using the LTR instruction on every exit, but this is quite slow because LTR is serializing. The 0x67 limit is entirely harmless unless ioperm() is in use, so defer the reload until a task using ioperm() is actually running. Here's some poorly done benchmarking using kvm-unit-tests: Before: cpuid 1313 vmcall 1195 mov_from_cr8 11 mov_to_cr8 17 inl_from_pmtimer 6770 inl_from_qemu 6856 inl_from_kernel 2435 outl_to_kernel 1402 After: cpuid 1291 vmcall 1181 mov_from_cr8 11 mov_to_cr8 16 inl_from_pmtimer 6457 inl_from_qemu 6209 inl_from_kernel 2339 outl_to_kernel 1391 Signed-off-by: Andy Lutomirski [Force-reload TR in invalidate_tss_limit. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/desc.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ioport.c | 5 +++++ arch/x86/kernel/process.c | 10 ++++++++++ arch/x86/kvm/vmx.c | 23 +++++++++------------- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 2e781bcc5e12..cb8f9149f6c8 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -205,6 +205,54 @@ static inline void native_load_tr_desc(void) asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +static inline void force_reload_TR(void) +{ + struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + tss_desc tss; + + memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); + + /* + * LTR requires an available TSS, and the TSS is currently + * busy. Make it be available so that LTR will work. + */ + tss.type = DESC_TSS; + write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); + + load_TR_desc(); +} + +DECLARE_PER_CPU(bool, need_tr_refresh); + +static inline void refresh_TR(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(this_cpu_read(need_tr_refresh))) { + force_reload_TR(); + this_cpu_write(need_tr_refresh, false); + } +} + +/* + * If you do something evil that corrupts the cached TSS limit (I'm looking + * at you, VMX exits), call this function. + * + * The optimization here is that the TSS limit only matters for Linux if the + * IO bitmap is in use. If the TSS limit gets forced to its minimum value, + * everything works except that IO bitmap will be ignored and all CPL 3 IO + * instructions will #GP, which is exactly what we want for normal tasks. + */ +static inline void invalidate_tss_limit(void) +{ + DEBUG_LOCKS_WARN_ON(preemptible()); + + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) + force_reload_TR(); + else + this_cpu_write(need_tr_refresh, true); +} + static inline void native_load_gdt(const struct desc_ptr *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 589b3193f102..b01bc8517450 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * this changes the io permissions bitmap in the current task. @@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + + preempt_disable(); + refresh_TR(); + preempt_enable(); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b615a1113f58..7780efa635b9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -64,6 +65,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); +DEFINE_PER_CPU(bool, need_tr_refresh); +EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh); + /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -209,6 +213,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); + + /* + * Make sure that the TSS limit is correct for the CPU + * to notice the IO bitmap. + */ + refresh_TR(); } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* * Clear any possible leftover bits: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2dd94cf597cc..acf6013a0caf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1990,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host[i].value = host_val; } -static void reload_tss(void) -{ - /* - * VT restores TR but not its size. Useless. - */ - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - struct desc_struct *descs; - - descs = (void *)gdt->address; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -} - static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { u64 guest_efer = vmx->vcpu.arch.efer; @@ -2172,7 +2159,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) loadsegment(es, vmx->host_state.es_sel); } #endif - reload_tss(); + invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif @@ -2293,6 +2280,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) (unsigned long)this_cpu_ptr(&cpu_tss)); vmcs_writel(HOST_GDTR_BASE, gdt->address); + /* + * VM exits change the host TR limit to 0x67 after a VM + * exit. This is okay, since 0x67 covers everything except + * the IO bitmap and have have code to handle the IO bitmap + * being lost after a VM exit. + */ + BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit v1.2.3-70-g09d2 From 96794e4ed4d758272c486e1529e431efb7045265 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Tue, 21 Feb 2017 03:50:01 -0500 Subject: KVM: VMX: use correct vmcs_read/write for guest segment selector/base Guest segment selector is 16 bit field and guest segment base is natural width field. Fix two incorrect invocations accordingly. Without this patch, build fails when aggressive inlining is used with ICC. Cc: stable@vger.kernel.org Signed-off-by: Chao Peng Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index acf6013a0caf..ef4ba71dbb66 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3905,7 +3905,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } @@ -8270,7 +8270,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -- cgit v1.2.3-70-g09d2 From 6c62985d576c8a816f528c39204207b9f449d923 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Feb 2017 13:36:03 -0500 Subject: x86/paravirt: Change vcp_is_preempted() arg type to long The cpu argument in the function prototype of vcpu_is_preempted() is changed from int to long. That makes it easier to provide a better optimized assembly version of that function. For Xen, vcpu_is_preempted(long) calls xen_vcpu_stolen(int), the downcast from long to int is not a problem as vCPU number won't exceed 32 bits. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/qspinlock.h | 2 +- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt-spinlocks.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1eea6ca40694..f75fbfe550f2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -673,7 +673,7 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } -static __always_inline bool pv_vcpu_is_preempted(int cpu) +static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); } diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index c343ab52579f..48a706f641f2 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -34,7 +34,7 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #define vcpu_is_preempted vcpu_is_preempted -static inline bool vcpu_is_preempted(int cpu) +static inline bool vcpu_is_preempted(long cpu) { return pv_vcpu_is_preempted(cpu); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 36bc66416021..334173d2665a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -589,7 +589,7 @@ out: local_irq_restore(flags); } -__visible bool __kvm_vcpu_is_preempted(int cpu) +__visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 6d4bf812af45..8caa8a18472b 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -20,7 +20,7 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } -__visible bool __native_vcpu_is_preempted(int cpu) +__visible bool __native_vcpu_is_preempted(long cpu) { return false; } -- cgit v1.2.3-70-g09d2 From dd0fd8bca1850ddadf5d33a9ed28f3707cd98ac7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Feb 2017 13:36:04 -0500 Subject: x86/kvm: Provide optimized version of vcpu_is_preempted() for x86-64 It was found when running fio sequential write test with a XFS ramdisk on a KVM guest running on a 2-socket x86-64 system, the %CPU times as reported by perf were as follows: 69.75% 0.59% fio [k] down_write 69.15% 0.01% fio [k] call_rwsem_down_write_failed 67.12% 1.12% fio [k] rwsem_down_write_failed 63.48% 52.77% fio [k] osq_lock 9.46% 7.88% fio [k] __raw_callee_save___kvm_vcpu_is_preempt 3.93% 3.93% fio [k] __kvm_vcpu_is_preempted Making vcpu_is_preempted() a callee-save function has a relatively high cost on x86-64 primarily due to at least one more cacheline of data access from the saving and restoring of registers (8 of them) to and from stack as well as one more level of function call. To reduce this performance overhead, an optimized assembly version of the the __raw_callee_save___kvm_vcpu_is_preempt() function is provided for x86-64. With this patch applied on a KVM guest on a 2-socket 16-core 32-thread system with 16 parallel jobs (8 on each socket), the aggregrate bandwidth of the fio test on an XFS ramdisk were as follows: I/O Type w/o patch with patch -------- --------- ---------- random read 8141.2 MB/s 8497.1 MB/s seq read 8229.4 MB/s 8304.2 MB/s random write 1675.5 MB/s 1701.5 MB/s seq write 1681.3 MB/s 1699.9 MB/s There are some increases in the aggregated bandwidth because of the patch. The perf data now became: 70.78% 0.58% fio [k] down_write 70.20% 0.01% fio [k] call_rwsem_down_write_failed 69.70% 1.17% fio [k] rwsem_down_write_failed 59.91% 55.42% fio [k] osq_lock 10.14% 10.14% fio [k] __kvm_vcpu_is_preempted The assembly code was verified by using a test kernel module to compare the output of C __kvm_vcpu_is_preempted() and that of assembly __raw_callee_save___kvm_vcpu_is_preempt() to verify that they matched. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- arch/x86/kernel/asm-offsets_64.c | 9 +++++++++ arch/x86/kernel/kvm.c | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 210927ee2e74..99332f550c48 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,6 +13,10 @@ static char syscalls_ia32[] = { #include }; +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#include +#endif + int main(void) { #ifdef CONFIG_PARAVIRT @@ -22,6 +26,11 @@ int main(void) BLANK(); #endif +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 334173d2665a..d05797be2f64 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -589,6 +589,7 @@ out: local_irq_restore(flags); } +#ifdef CONFIG_X86_32 __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); @@ -597,6 +598,29 @@ __visible bool __kvm_vcpu_is_preempted(long cpu) } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); +#else + +#include + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +"ret;" +".popsection"); + +#endif + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ -- cgit v1.2.3-70-g09d2