diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2024-03-11 10:29:22 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2024-03-11 10:29:22 -0400 |
commit | 41ebae2ecd1496aaf72596af9f37529d62a160ab (patch) | |
tree | 622acefe722f98cebca79afdd72f1c68149b3f54 /arch/x86 | |
parent | c9cd0beae9d9c3a675eb73ee45e29900d8c11bd2 (diff) | |
parent | a364c014a2c1ad6e011bc5fdb8afb9d4ba316956 (diff) |
Merge tag 'kvm-x86-mmu-6.9' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.9:
- Clean up code related to unprotecting shadow pages when retrying a guest
instruction after failed #PF-induced emulation.
- Zap TDP MMU roots at 4KiB granularity to minimize the delay in yielding if
a reschedule is needed, e.g. if a high priority task needs to run. Because
KVM doesn't support yielding in the middle of processing a zapped non-leaf
SPTE, zapping at 1GiB granularity can result in multi-millisecond lag when
attempting to schedule in a high priority.
- Rework TDP MMU root unload, free, and alloc to run with mmu_lock held for
read, e.g. to avoid serializing vCPUs when userspace deletes a memslot.
- Allocate write-tracking metadata on-demand to avoid the memory overhead when
running kernels built with KVMGT support (external write-tracking enabled),
but for workloads that don't use nested virtualization (shadow paging) or
KVMGT.
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 37 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 68 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 124 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 39 |
6 files changed, 203 insertions, 76 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b87770ea34d..8116839cb263 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1468,6 +1468,15 @@ struct kvm_arch { */ bool shadow_root_allocated; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * If set, the VM has (or had) an external write tracking user, and + * thus all write tracking metadata has been allocated, even if KVM + * itself isn't using write tracking. + */ + bool external_write_tracking_enabled; +#endif + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d6cdeab1f8a..e5e2af69e24d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (WARN_ON_ONCE(!sp)) return; - if (is_tdp_mmu_page(sp)) + if (is_tdp_mmu_page(sp)) { + lockdep_assert_held_read(&kvm->mmu_lock); kvm_tdp_mmu_put_root(kvm, sp); - else if (!--sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } else { + lockdep_assert_held_write(&kvm->mmu_lock); + if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { + bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; @@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, return; } - write_lock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, mmu->root.pgd = 0; } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu) { + read_unlock(&kvm->mmu_lock); + WARN_ON_ONCE(!list_empty(&invalid_list)); + } else { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) unsigned i; int r; + if (tdp_mmu_enabled) + return kvm_tdp_mmu_alloc_root(vcpu); + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; - if (tdp_mmu_enabled) { - root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - mmu->root.hpa = root; - } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { @@ -6997,9 +7010,7 @@ int kvm_mmu_vendor_module_init(void) kvm_mmu_reset_all_pte_masks(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, SLAB_ACCOUNT, NULL); + pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index c87da11f3a04..f6448284c18e 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -20,10 +20,23 @@ #include "mmu_internal.h" #include "page_track.h" +static bool kvm_external_write_tracking_enabled(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * Read external_write_tracking_enabled before related pointers. Pairs + * with the smp_store_release in kvm_page_track_write_tracking_enable(). + */ + return smp_load_acquire(&kvm->arch.external_write_tracking_enabled); +#else + return false; +#endif +} + bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { - return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || - !tdp_enabled || kvm_shadow_root_allocated(kvm); + return kvm_external_write_tracking_enabled(kvm) || + kvm_shadow_root_allocated(kvm) || !tdp_enabled; } void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) @@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm) return init_srcu_struct(&head->track_srcu); } +static int kvm_enable_external_write_tracking(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i, bkt; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Check for *any* write tracking user (not just external users) under + * lock. This avoids unnecessary work, e.g. if KVM itself is using + * write tracking, or if two external users raced when registering. + */ + if (kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, bkt, slots) { + /* + * Intentionally do NOT free allocations on failure to + * avoid having to track which allocations were made + * now versus when the memslot was created. The + * metadata is guaranteed to be freed when the slot is + * freed, and will be kept/used if userspace retries + * the failed ioctl() instead of killing the VM. + */ + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + +out_success: + /* + * Ensure that external_write_tracking_enabled becomes true strictly + * after all the related pointers are set. + */ + smp_store_release(&kvm->arch.external_write_tracking_enabled, true); +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + /* * register the notifier so that event interception for the tracked guest * pages can be received. @@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; + int r; if (!kvm || kvm->mm != current->mm) return -ESRCH; + if (!kvm_external_write_tracking_enabled(kvm)) { + r = kvm_enable_external_write_tracking(kvm); + if (r) + return r; + } + kvm_get_kvm(kvm); head = &kvm->arch.track_notifier_head; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6ae19b4ee5b1..d078157e62aa 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * If shared is set, this function is operating under the MMU lock in read * mode. */ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\ - for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ - ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ + if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ @@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ - if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ - kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ + ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + ((_only_valid) && (_root)->role.invalid))) { \ } else +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root(_kvm, _root, _as_id, false) + +#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root(_kvm, _root, _as_id, true) + static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; @@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; + struct kvm_mmu *mmu = vcpu->arch.mmu; + union kvm_mmu_page_role role = mmu->root_role; + int as_id = kvm_mmu_role_as_id(role); struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); + /* + * Check for an existing root before acquiring the pages lock to avoid + * unnecessary serialization if multiple vCPUs are loading a new root. + * E.g. when bringing up secondary vCPUs, KVM will already have created + * a valid root on behalf of the primary vCPU. + */ + read_lock(&kvm->mmu_lock); + + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) { + if (root->role.word == role.word) + goto out_read_unlock; + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); /* - * Check for an existing root before allocating a new one. Note, the - * role check prevents consuming an invalid root. + * Recheck for an existing root after acquiring the pages lock, another + * vCPU may have raced ahead and created a new usable root. Manually + * walk the list of roots as the standard macros assume that the pages + * lock is *not* held. WARN if grabbing a reference to a usable root + * fails, as the last reference to a root can only be put *after* the + * root has been invalidated, which requires holding mmu_lock for write. */ - for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(root)) - goto out; + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) + goto out_spin_unlock; } root = tdp_mmu_alloc_sp(vcpu); @@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); - - spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -out: - return __pa(root->spt); +out_spin_unlock: + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +out_read_unlock: + read_unlock(&kvm->mmu_lock); + /* + * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest + * and actually consuming the root if it's invalidated after dropping + * mmu_lock, and the root can't be freed as this vCPU holds a reference. + */ + mmu->root.hpa = __pa(root->spt); + mmu->root.pgd = 0; + return 0; } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); /* - * To avoid RCU stalls due to recursively removing huge swaths of SPs, - * split the zap into two passes. On the first pass, zap at the 1gb - * level, and then zap top-level SPs on the second pass. "1gb" is not - * arbitrary, as KVM must be able to zap a 1gb shadow page without - * inducing a stall to allow in-place replacement with a 1gb hugepage. + * Zap roots in multiple passes of decreasing granularity, i.e. zap at + * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all + * preempt models) or mmu_lock contention (full or real-time models). + * Zapping at finer granularity marginally increases the total time of + * the zap, but in most cases the zap itself isn't latency sensitive. * - * Because zapping a SP recurses on its children, stepping down to - * PG_LEVEL_4K in the iterator itself is unnecessary. + * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps + * in order to mimic the page fault path, which can replace a 1GiB page + * table with an equivalent 1GiB hugepage, i.e. can get saddled with + * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This + * allows verifying that KVM can safely zap 1GiB regions, e.g. without + * inducing RCU stalls, without relying on a relatively rare event + * (zapping roots is orders of magnitude more common). Note, because + * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K + * in the iterator itself is unnecessary. */ + if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K); + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M); + } __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); __tdp_mmu_zap_root(kvm, root, shared, root->role.level); @@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_iter_set_spte(kvm, &iter, 0); - flush = true; + + /* + * Zappings SPTEs in invalid roots doesn't require a TLB flush, + * see kvm_tdp_mmu_zap_invalidated_roots() for details. + */ + if (!root->role.invalid) + flush = true; } rcu_read_unlock(); @@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns - * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or - * more SPTEs were zapped since the MMU lock was last acquired. + * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots. + * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if + * one or more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * the VM is being destroyed). * * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. - * See kvm_tdp_mmu_get_vcpu_root_hpa(). + * See kvm_tdp_mmu_alloc_root(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) { @@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } @@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 20d97aa46c49..6e1ea04ca885 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,7 +10,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6b1b85dca8a..064862d87b9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8792,31 +8792,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_release_pfn_clean(pfn); - /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu->root_role.direct) { - unsigned int indirect_shadow_pages; - - write_lock(&vcpu->kvm->mmu_lock); - indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - write_unlock(&vcpu->kvm->mmu_lock); - - if (indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - - return true; - } - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-enter the - * guest to let CPU execute the instruction. + * If emulation may have been triggered by a write to a shadowed page + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the + * guest to let the CPU re-execute the instruction in the hope that the + * CPU can cleanly execute the instruction that KVM failed to emulate. */ - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the access faults on its page table, it can not - * be fixed by unprotecting shadow page and it should - * be reported to userspace. + if (vcpu->kvm->arch.indirect_shadow_pages) + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + + /* + * If the failed instruction faulted on an access to page tables that + * are used to translate any part of the instruction, KVM can't resolve + * the issue by unprotecting the gfn, as zapping the shadow page will + * result in the instruction taking a !PRESENT page fault and thus put + * the vCPU into an infinite loop of page faults. E.g. KVM will create + * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and + * then zap the SPTE to unprotect the gfn, and then do it all over + * again. Report the error to userspace. */ return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } |