From b8f8d190fa8fa1909dda12d771df67125d6fbf0c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 22 Dec 2022 09:26:31 +0000 Subject: KVM: arm64: Document the behaviour of S1PTW faults on RO memslots Although the KVM API says that a write to a RO memslot must result in a KVM_EXIT_MMIO describing the write, the arm64 architecture doesn't provide the *data* written by a Stage-1 page table walk (we only get the address). Since there isn't much userspace can do with so little information anyway, document the fact that such an access results in a guest exception, not an exit. This is consistent with the guest being terminally broken anyway. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/api.rst | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation/virt') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0dd5d8733dd5..42db72a0cbe6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1354,6 +1354,14 @@ the memory region are automatically reflected into the guest. For example, an mmap() that affects the region will be made visible immediately. Another example is madvise(MADV_DROP). +Note: On arm64, a write generated by the page-table walker (to update +the Access and Dirty flags, for example) never results in a +KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This +is because KVM cannot provide the data that would be written by the +page-table walker, making it impossible to emulate the access. +Instead, an abort (data abort if the cause of the page-table update +was a load or a store, instruction abort if it was an instruction +fetch) is injected in the guest. 4.36 KVM_SET_TSS_ADDR --------------------- -- cgit v1.2.3-70-g09d2 From 45e966fcca03ecdcccac7cb236e16eea38cc18af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 22 Oct 2022 04:17:53 -0400 Subject: KVM: x86: Do not return host topology information from KVM_GET_SUPPORTED_CPUID Passing the host topology to the guest is almost certainly wrong and will confuse the scheduler. In addition, several fields of these CPUID leaves vary on each processor; it is simply impossible to return the right values from KVM_GET_SUPPORTED_CPUID in such a way that they can be passed to KVM_SET_CPUID2. The values that will most likely prevent confusion are all zeroes. Userspace will have to override it anyway if it wishes to present a specific topology to the guest. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ arch/x86/kvm/cpuid.c | 32 ++++++++++++++++---------------- 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'Documentation/virt') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index deb494f759ed..d8ea37dfddf4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8310,6 +8310,20 @@ CPU[EAX=1]:ECX[24] (TSC_DEADLINE) is not reported by ``KVM_GET_SUPPORTED_CPUID`` It can be enabled if ``KVM_CAP_TSC_DEADLINE_TIMER`` is present and the kernel has enabled in-kernel emulation of the local APIC. +CPU topology +~~~~~~~~~~~~ + +Several CPUID values include topology information for the host CPU: +0x0b and 0x1f for Intel systems, 0x8000001e for AMD systems. Different +versions of KVM return different values for this information and userspace +should not rely on it. Currently they return all zeroes. + +If userspace wishes to set up a guest topology, it should be careful that +the values of these three leaves differ for each CPU. In particular, +the APIC ID is found in EDX for all subleaves of 0x0b and 0x1f, and in EAX +for 0x8000001e; the latter also encodes the core id and node id in bits +7:0 of EBX and ECX respectively. + Obsolete ioctls and capabilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b14653b61470..596061c1610e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -770,16 +770,22 @@ struct kvm_cpuid_array { int nent; }; +static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) +{ + if (array->nent >= array->maxnent) + return NULL; + + return &array->entries[array->nent++]; +} + static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); - if (array->nent >= array->maxnent) + if (!entry) return NULL; - entry = &array->entries[array->nent++]; - memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; @@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = edx.full; break; } - /* - * Per Intel's SDM, the 0x1f is a superset of 0xb, - * thus they can be handled by common code. - */ case 0x1f: case 0xb: /* - * Populate entries until the level type (ECX[15:8]) of the - * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is - * the starting entry, filled by the primary do_host_cpuid(). + * No topology; a valid topology is indicated by the presence + * of subleaf 1. */ - for (i = 1; entry->ecx & 0xff00; ++i) { - entry = do_host_cpuid(array, function, i); - if (!entry) - goto out; - } + entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); @@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: + /* Do not return host topology information. */ + entry->eax = entry->ebx = entry->ecx = 0; + entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { -- cgit v1.2.3-70-g09d2 From 3a9ae31ac26a58d33008c42f6cd022afc2af2dc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Jan 2023 06:02:16 -0500 Subject: Documentation: kvm: fix SRCU locking order docs kvm->srcu is taken in KVM_RUN and several other vCPU ioctls, therefore vcpu->mutex is susceptible to the same deadlock that is documented for kvm->slots_lock. The same holds for kvm->lock, since kvm->lock is held outside vcpu->mutex. Fix the documentation and rearrange it to highlight the difference between these locks and kvm->slots_arch_lock, and how kvm->slots_arch_lock can be useful while processing a vmexit. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'Documentation/virt') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index a3ca76f9be75..5ee017740d55 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -24,17 +24,18 @@ The acquisition orders for mutexes are as follows: For SRCU: -- ``synchronize_srcu(&kvm->srcu)`` is called _inside_ - the kvm->slots_lock critical section, therefore kvm->slots_lock - cannot be taken inside a kvm->srcu read-side critical section. - Instead, kvm->slots_arch_lock is released before the call - to ``synchronize_srcu()`` and _can_ be taken inside a - kvm->srcu read-side critical section. - -- kvm->lock is taken inside kvm->srcu, therefore - ``synchronize_srcu(&kvm->srcu)`` cannot be called inside - a kvm->lock critical section. If you cannot delay the - call until after kvm->lock is released, use ``call_srcu``. +- ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections + for kvm->lock, vcpu->mutex and kvm->slots_lock. These locks _cannot_ + be taken inside a kvm->srcu read-side critical section; that is, the + following is broken:: + + srcu_read_lock(&kvm->srcu); + mutex_lock(&kvm->slots_lock); + +- kvm->slots_arch_lock instead is released before the call to + ``synchronize_srcu()``. It _can_ therefore be taken inside a + kvm->srcu read-side critical section, for example while processing + a vmexit. On x86: -- cgit v1.2.3-70-g09d2 From 310bc39546a435c83cc27a0eba878afac0d74714 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2023 18:06:51 +0000 Subject: KVM: x86/xen: Avoid deadlock by adding kvm->arch.xen.xen_lock leaf node lock In commit 14243b387137a ("KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery") the clever version of me left some helpful notes for those who would come after him: /* * For the irqfd workqueue, using the main kvm->lock mutex is * fine since this function is invoked from kvm_set_irq() with * no other lock held, no srcu. In future if it will be called * directly from a vCPU thread (e.g. on hypercall for an IPI) * then it may need to switch to using a leaf-node mutex for * serializing the shared_info mapping. */ mutex_lock(&kvm->lock); In commit 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") the other version of me ran straight past that comment without reading it, and introduced a potential deadlock by taking vcpu->mutex and kvm->lock in the wrong order. Solve this as originally suggested, by adding a leaf-node lock in the Xen state rather than using kvm->lock for it. Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") Signed-off-by: David Woodhouse Message-Id: <20230111180651.14394-4-dwmw2@infradead.org> [Rebase, add docs. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 2 +- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 67 +++++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) (limited to 'Documentation/virt') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 5ee017740d55..a0146793d197 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -39,7 +39,7 @@ For SRCU: On x86: -- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock +- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock and kvm->arch.xen.xen_lock - kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..6aaae18f1854 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1111,6 +1111,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 651f9c5b873d..8fd41f5deae3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -607,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; @@ -636,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -647,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = -EOPNOTSUPP; break; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -664,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: @@ -703,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return r; } @@ -711,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { @@ -939,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } srcu_read_unlock(&vcpu->kvm->srcu, idx); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -947,7 +947,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: @@ -1030,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -1123,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); if (xhc->msr && !kvm->arch.xen_hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); @@ -1132,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return 0; } @@ -1675,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) mm_borrowed = true; } - /* - * For the irqfd workqueue, using the main kvm->lock mutex is - * fine since this function is invoked from kvm_set_irq() with - * no other lock held, no srcu. In future if it will be called - * directly from a vCPU thread (e.g. on hypercall for an IPI) - * then it may need to switch to using a leaf-node mutex for - * serializing the shared_info mapping. - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * It is theoretically possible for the page to be unmapped @@ -1712,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } while(!rc); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (mm_borrowed) kthread_unuse_mm(kvm->mm); @@ -1828,7 +1820,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, int ret; /* Protect writes to evtchnfd as well as the idr lookup. */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); ret = -ENOENT; @@ -1859,7 +1851,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, } ret = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return ret; } @@ -1922,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; @@ -1943,9 +1935,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; @@ -1963,7 +1955,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) int i; int n = 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * Because synchronize_srcu() cannot be called inside the @@ -1975,7 +1967,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); if (!all_evtchnfds) { - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return -ENOMEM; } @@ -1984,7 +1976,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); synchronize_srcu(&kvm->srcu); @@ -2086,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { + mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } -- cgit v1.2.3-70-g09d2