-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 5
-rw-r--r-- | arch/x86/kvm/Kconfig | 1
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 201
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 3
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 99
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 76
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 253
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 6
-rw-r--r-- | arch/x86/kvm/x86.c | 18
-rw-r--r-- | virt/kvm/Kconfig | 4
-rw-r--r-- | virt/kvm/kvm_main.c | 20
11 files changed, 278 insertions, 408 deletions
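The key behavioral change, spread across the spte.c, spte.h, and tdp_mmu.c hunks below, is that flushing after a leaf SPTE update is now keyed solely off clearing the MMU-writable bit, rather than off Accessed/Dirty transitions. A minimal, self-contained sketch of that rule is shown here; the bit position and helper name are illustrative stand-ins for the patch's own shadow_mmu_writable_mask and leaf_spte_change_needs_tlb_flush().

/*
 * Condensed illustration of the flush rule this series centralizes:
 * after updating a shadow-present leaf SPTE, a remote TLB flush is
 * required only when the MMU-writable bit is cleared, i.e. when the
 * SPTE is write-protected for write-tracking.  Clearing only the
 * Accessed or Dirty bit no longer forces an immediate flush.
 */
#include <stdbool.h>
#include <stdint.h>

#define MMU_WRITABLE_BIT (1ull << 54)	/* stand-in for shadow_mmu_writable_mask */

static inline bool needs_tlb_flush(uint64_t old_spte, uint64_t new_spte)
{
	return (old_spte & MMU_WRITABLE_BIT) && !(new_spte & MMU_WRITABLE_BIT);
}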
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6d9f763a7bb9..433e65974e2b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1306,7 +1306,6 @@ struct kvm_arch { bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. A shadow page is on this list if its @@ -1955,8 +1954,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f09f13c01c6b..1ed1e4f5d51c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER + select KVM_ELIDE_TLB_FLUSH_IF_YOUNG select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 084e6c4d1eaf..38128e7b9af1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Returns true if the TLB needs to be flushed */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; @@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return old_spte; + return false; } if (!spte_has_volatile_bits(old_spte)) @@ -506,49 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - - return old_spte; -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote - * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only - * spte, even though the writable spte might be cached on a CPU's TLB. - * - * Returns true if the TLB needs to be flushed - */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) -{ - bool flush = false; - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); - - if (!is_shadow_present_pte(old_spte)) - return false; - - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). 
- */ - if (is_mmu_writable_spte(old_spte) && - !is_writable_pte(new_spte)) - flush = true; - - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) - flush = true; - - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) - flush = true; + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - return flush; + return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* @@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { + /* + * WARN if mmu_spte_update() signals the need + * for a TLB flush, as Access tracking a SPTE + * should never trigger an _immediate_ flush. + */ spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); + WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } @@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) #endif } -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, +1); + kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, -1); + kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. 
*/ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) - new_spte = restore_acc_track_spte(new_spte); + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte) | + shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can @@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; @@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -6446,7 +6403,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -6462,7 +6419,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_tdp_mmu_zap_invalidated_roots(kvm); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); @@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, continue; } - spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } @@ -6951,8 +6902,7 @@ restart: * mapping if the indirect sp has level = 1. 
*/ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) @@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long mmu_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. - */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; -} - -static unsigned long mmu_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker *mmu_shrinker; - static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); @@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - mmu_shrinker = shrinker_alloc(0, "x86-mmu"); - if (!mmu_shrinker) - goto out_shrinker; - - mmu_shrinker->count_objects = mmu_shrink_count; - mmu_shrinker->scan_objects = mmu_shrink_scan; - mmu_shrinker->seeks = DEFAULT_SEEKS * 10; - - shrinker_register(mmu_shrinker); - return 0; -out_shrinker: - percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; @@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - shrinker_free(mmu_shrinker); } /* diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index fabbea504a69..b00abbe3f6cf 100644 --- 
a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -346,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index f1a50a78badb..22551e2f1d00 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); +bool __read_mostly kvm_ad_enabled; + u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; @@ -133,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) */ bool spte_has_volatile_bits(u64 spte) { - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; @@ -179,7 +175,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= shadow_present_mask; if (!prefetch || synchronizing) - spte |= spte_shadow_accessed_mask(spte); + spte |= shadow_accessed_mask; /* * For simplicity, enforce the NX huge page mitigation even if not @@ -223,42 +219,27 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - - /* - * When overwriting an existing leaf SPTE, and the old SPTE was - * writable, skip trying to unsync shadow pages as any relevant - * shadow pages must already be unsync, i.e. the hash lookup is - * unnecessary (and expensive). - * - * The same reasoning applies to dirty page/folio accounting; - * KVM marked the folio dirty when the old SPTE was created, - * thus there's no need to mark the folio dirty again. - * - * Note, both cases rely on KVM not changing PFNs without first - * zapping the old SPTE, which is guaranteed by both the shadow - * MMU and the TDP MMU. - */ - if (is_last_spte(old_spte, level) && is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. 
*/ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) { + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + shadow_dirty_mask; } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - -out: if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); @@ -281,15 +262,15 @@ out: return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -297,6 +278,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -304,8 +295,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; @@ -333,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { @@ -365,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte) spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; - spte &= ~shadow_acc_track_mask; + spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } @@ -435,9 +446,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; - shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_accessed_mask = VMX_EPT_ACCESS_BIT; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. 
*/ @@ -468,6 +481,8 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; + kvm_ad_enabled = true; + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index c81cac9358e0..f332b33bc817 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #define SHADOW_NONPRESENT_VALUE 0ULL #endif + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. + */ +extern bool __read_mostly kvm_ad_enabled; + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte) (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) -{ - return !!shadow_accessed_mask; -} - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -318,18 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; @@ -357,17 +343,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte) static inline bool is_accessed_spte(u64 spte) { - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; + return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, @@ -485,6 +461,33 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for + * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, + * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM + * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, + * not whether or not SPTEs were modified, i.e. only the write-tracking case + * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, e.g. 
KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. + * + * Note, this logic only applies to shadow-present leaf SPTEs. The caller is + * responsible for checking that the old SPTE is shadow-present, and is also + * responsible for determining whether or not a TLB flush is required when + * modifying a shadow-present non-leaf SPTE. + */ +static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) +{ + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); +} + static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; @@ -501,8 +504,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 91caa73a905b..4508d868f1cd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -583,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - lockdep_assert_held_read(&kvm->mmu_lock); - - /* - * Freeze the SPTE by setting it to a special, non-present value. This - * will stop other threads from immediately installing a present entry - * in its place before the TLBs are flushed. - * - * Delay processing of the zapped SPTE until after TLBs are flushed and - * the FROZEN_SPTE is replaced (see below). - */ - ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); - if (ret) - return ret; - - kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); - - /* - * No other thread can overwrite the frozen SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special frozen SPTE value. Use the raw write helper to - * avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); - - /* - * Process the zapped SPTE after flushing TLBs, and after replacing - * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the FROZEN_SPTE and reduces contention on the child - * SPTEs. 
- */ - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - SHADOW_NONPRESENT_VALUE, iter->level, true); - - return 0; -} - - /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance @@ -680,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} + /* * Yield if the MMU lock is contended or this thread needs to return control * to the scheduler. @@ -698,31 +666,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON_ONCE(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); - /* Ensure forward progress has been made before yielding. */ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); - - rcu_read_unlock(); + if (flush) + kvm_flush_remote_tlbs(kvm); - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_unlock(); - rcu_read_lock(); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); + rcu_read_lock(); - iter->yielded = true; - } + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) @@ -1033,7 +997,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte)))) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* @@ -1073,7 +1038,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled); int ret = 0; if (shared) { @@ -1190,33 +1155,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, return flush; } -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ - struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. 
- */ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); - - rcu_read_unlock(); - } - - return ret; -} - /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. @@ -1225,15 +1163,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) { u64 new_spte; - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte, @@ -1249,23 +1182,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); - return true; } -bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! + */ + for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { + guard(rcu)(); + + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; + + if (test_only) + return true; + + ret = true; + kvm_tdp_mmu_age_spte(&iter); + } + } + + return ret; } -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return is_accessed_spte(iter->old_spte); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); } bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* @@ -1356,7 +1314,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1489,16 +1447,15 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. 
*/ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; } -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) { const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - bool spte_set = false; rcu_read_lock(); @@ -1519,31 +1476,24 @@ retry: if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; - - spte_set = true; } rcu_read_unlock(); - return spte_set; } /* * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the - * memslot. Returns true if an SPTE has been changed and the TLBs need to be - * flushed. + * memslot. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1602,21 +1552,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; + bool flush = false; + u64 huge_spte; + int r; + + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) @@ -1640,31 +1624,40 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); if (max_mapping_level < iter.level) continue; - /* Note, a successful atomic zap also does a remote TLB flush. 
*/ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..f03ca0dd13d9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,14 +34,14 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 83fe0a78146f..8507b300ff43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13104,19 +13104,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. 
*/ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page, diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index fd6a3010afa8..54e959e7d68f 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -100,6 +100,10 @@ config KVM_GENERIC_MMU_NOTIFIER select MMU_NOTIFIER bool +config KVM_ELIDE_TLB_FLUSH_IF_YOUNG + depends on KVM_GENERIC_MMU_NOTIFIER + bool + config KVM_GENERIC_MEMORY_ATTRIBUTES depends on KVM_GENERIC_MMU_NOTIFIER bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ec6fc8164f66..27186b06518a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -631,7 +631,8 @@ mmu_unlock: static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, - gfn_handler_t handler) + gfn_handler_t handler, + bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { @@ -639,7 +640,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, - .flush_on_ret = true, + .flush_on_ret = flush_on_ret, .may_block = false, }; @@ -651,17 +652,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn unsigned long end, gfn_handler_t handler) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_mmu_notifier_range range = { - .start = start, - .end = end, - .handler = handler, - .on_lock = (void *)kvm_null_fn, - .flush_on_ret = false, - .may_block = false, - }; - - return __kvm_handle_hva_range(kvm, &range).ret; + return kvm_handle_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) @@ -826,7 +817,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, { trace_kvm_age_hva(start, end); - return kvm_handle_hva_range(mn, start, end, kvm_age_gfn); + return kvm_handle_hva_range(mn, start, end, kvm_age_gfn, + !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, |