Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 209
1 file changed, 127 insertions, 82 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 559634b59d2a..46ba2e03a892 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -781,7 +781,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb |= 1u << BP_VECTOR;
 	if (to_vmx(vcpu)->rmode.vm86_active)
 		eb = ~0;
-	if (enable_ept)
+	if (!vmx_need_pf_intercept(vcpu))
 		eb &= ~(1u << PF_VECTOR);
 
 	/* When we are running a nested L2 guest and L1 specified for it a
@@ -1816,7 +1816,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 		msr->data = vmx_get_perf_capabilities();
 		return 0;
 	default:
-		return 1;
+		return KVM_MSR_RET_INVALID;
 	}
 }
 
@@ -2063,7 +2063,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
-		if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+		if (kvm_spec_ctrl_test_value(data))
 			return 1;
 
 		vmx->spec_ctrl = data;
@@ -2934,14 +2934,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
 
 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
-	u64 root_hpa = vcpu->arch.mmu->root_hpa;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	u64 root_hpa = mmu->root_hpa;
 
 	/* No flush required if the current context is invalid. */
 	if (!VALID_PAGE(root_hpa))
 		return;
 
 	if (enable_ept)
-		ept_sync_context(construct_eptp(vcpu, root_hpa));
+		ept_sync_context(construct_eptp(vcpu, root_hpa,
+						mmu->shadow_root_level));
 	else if (!is_guest_mode(vcpu))
 		vpid_sync_context(to_vmx(vcpu)->vpid);
 	else
@@ -3064,26 +3066,19 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vmx->emulation_required = emulation_required(vcpu);
 }
 
-static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
+static int vmx_get_max_tdp_level(void)
 {
-	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+	if (cpu_has_vmx_ept_5levels())
 		return 5;
 	return 4;
 }
 
-static int get_ept_level(struct kvm_vcpu *vcpu)
-{
-	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-		return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-
-	return vmx_get_tdp_level(vcpu);
-}
-
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+		   int root_level)
 {
 	u64 eptp = VMX_EPTP_MT_WB;
 
-	eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
 
 	if (enable_ept_ad_bits &&
 	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
@@ -3093,7 +3088,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 	return eptp;
 }
 
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+			     int pgd_level)
 {
 	struct kvm *kvm = vcpu->kvm;
 	bool update_guest_cr3 = true;
@@ -3101,7 +3097,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
 	u64 eptp;
 
 	if (enable_ept) {
-		eptp = construct_eptp(vcpu, pgd);
+		eptp = construct_eptp(vcpu, pgd, pgd_level);
 		vmcs_write64(EPT_POINTER, eptp);
 
 		if (kvm_x86_ops.tlb_remote_flush) {
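A minimal stand-alone sketch (not part of the patch) of how the EPT pointer written above is assembled. Only the architectural bit positions are assumed: memory type in bits 2:0, page-walk length minus one in bits 5:3, accessed/dirty enable in bit 6, and the page-aligned root physical address from bit 12 up. The EPTP_* names and make_eptp() helper are illustrative stand-ins for the kernel's VMX_EPTP_* macros and construct_eptp().

	/* Hedged sketch: assembling an EPT pointer from its architectural fields. */
	#include <stdint.h>
	#include <stdio.h>

	#define EPTP_MT_WB	6ULL		/* write-back paging-structure memory type */
	#define EPTP_PWL_4	(3ULL << 3)	/* 4-level walk: length - 1 = 3 */
	#define EPTP_PWL_5	(4ULL << 3)	/* 5-level walk: length - 1 = 4 */
	#define EPTP_AD_ENABLE	(1ULL << 6)	/* enable accessed/dirty assists */

	static uint64_t make_eptp(uint64_t root_hpa, int root_level, int ad_bits)
	{
		uint64_t eptp = EPTP_MT_WB;

		eptp |= (root_level == 5) ? EPTP_PWL_5 : EPTP_PWL_4;
		if (ad_bits)
			eptp |= EPTP_AD_ENABLE;
		return eptp | (root_hpa & ~0xfffULL);	/* page-aligned root table */
	}

	int main(void)
	{
		/* Example: 4-level walk, A/D enabled, root at 0x12345000. */
		printf("eptp = %#llx\n",
		       (unsigned long long)make_eptp(0x12345000ULL, 4, 1));
		return 0;
	}

Passing root_level explicitly, as the hunks above do, lets the same helper serve both the host MMU root and a nested EPTP without consulting vCPU state.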
@@ -4356,6 +4352,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 		vmx->pt_desc.guest.output_mask = 0x7F;
 		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
 	}
+
+	/*
+	 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+	 * between guest and host. In that case we only care about present
+	 * faults.
+	 */
+	if (enable_ept) {
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+	}
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -4782,18 +4788,25 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 3;
+		vcpu->run->internal.ndata = 4;
 		vcpu->run->internal.data[0] = vect_info;
 		vcpu->run->internal.data[1] = intr_info;
 		vcpu->run->internal.data[2] = error_code;
+		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
 		return 0;
 	}
 
 	if (is_page_fault(intr_info)) {
 		cr2 = vmx_get_exit_qual(vcpu);
-		/* EPT won't cause page fault directly */
-		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
-		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+		if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+			/*
+			 * EPT will cause page fault only if we need to
+			 * detect illegal GPAs.
+			 */
+			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+			return 1;
+		} else
+			return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
 	}
 
 	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -5309,6 +5322,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
 	vcpu->arch.exit_qualification = exit_qualification;
+
+	/*
+	 * Check that the GPA doesn't exceed physical memory limits, as that is
+	 * a guest page fault. We have to emulate the instruction here, because
+	 * if the illegal address is that of a paging structure, then
+	 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+	 * would also use advanced VM-exit information for EPT violations to
+	 * reconstruct the page fault error code.
+	 */
+	if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+		return kvm_emulate_instruction(vcpu, 0);
+
 	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -6005,6 +6030,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= exit_reason;
+		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
 		return 0;
 	}
 
@@ -6013,6 +6039,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
+		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
 		return 0;
 	}
 
@@ -6039,6 +6066,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 			vcpu->run->internal.data[3] =
 				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 		}
+		vcpu->run->internal.data[vcpu->run->internal.ndata++] =
+			vcpu->arch.last_vmentry_cpu;
 		return 0;
 	}
 
@@ -6094,8 +6123,9 @@ unexpected_vmexit:
 	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 	vcpu->run->internal.suberror =
 			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-	vcpu->run->internal.ndata = 1;
+	vcpu->run->internal.ndata = 2;
 	vcpu->run->internal.data[0] = exit_reason;
+	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
 	return 0;
 }
 
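A minimal sketch (not taken from the patch) of the architectural rule behind the PAGE_FAULT_ERROR_CODE_MASK/MATCH fields programmed in init_vmcs() above: a guest #PF causes a VM exit when the masked error code equals the match value and bit 14 of the exception bitmap is set, or when the comparison fails and bit 14 is clear. The pf_causes_vmexit() name is illustrative only.

	/* Hedged sketch: when does a guest #PF with this error code exit? */
	#include <stdbool.h>
	#include <stdint.h>

	#define PFERR_PRESENT_MASK	(1u << 0)
	#define PF_VECTOR		14

	static bool pf_causes_vmexit(uint32_t exception_bitmap,
				     uint32_t pfec_mask, uint32_t pfec_match,
				     uint32_t error_code)
	{
		bool bitmap_bit = exception_bitmap & (1u << PF_VECTOR);
		bool match = (error_code & pfec_mask) == pfec_match;

		/* Exit when the match result agrees with the bitmap bit. */
		return bitmap_bit ? match : !match;
	}

With both fields set to PFERR_PRESENT_MASK and bit 14 of the exception bitmap left set (the vmx_need_pf_intercept() case from the first hunk), only faults whose error code has the present bit set cause an exit, which is what the "we only care about present faults" comment refers to.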
@@ -6109,7 +6139,7 @@ unexpected_vmexit:
  * information but as all relevant affected CPUs have 32KiB L1D cache size
  * there is no point in doing so.
  */
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 {
 	int size = PAGE_SIZE << L1D_CACHE_ORDER;
 
@@ -6142,7 +6172,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 	vcpu->stat.l1d_flush++;
 
 	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
 		return;
 	}
 
@@ -6628,7 +6658,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 	}
 }
 
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 {
 	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
 		vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@ -6650,6 +6680,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+					struct vcpu_vmx *vmx)
+{
+	/*
+	 * VMENTER enables interrupts (host state), but the kernel state is
+	 * interrupts disabled when this is invoked. Also tell RCU about
+	 * it. This is the same logic as for exit_to_user_mode().
+	 *
+	 * This ensures that e.g. latency analysis on the host observes
+	 * guest mode as interrupt enabled.
+	 *
+	 * guest_enter_irqoff() informs context tracking about the
+	 * transition to guest mode and if enabled adjusts RCU state
+	 * accordingly.
+	 */
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	instrumentation_end();
+
+	guest_enter_irqoff();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+
+	/* L1D Flush includes CPU buffer clear to mitigate MDS */
+	if (static_branch_unlikely(&vmx_l1d_should_flush))
+		vmx_l1d_flush(vcpu);
+	else if (static_branch_unlikely(&mds_user_clear))
+		mds_clear_cpu_buffers();
+
+	if (vcpu->arch.cr2 != native_read_cr2())
+		native_write_cr2(vcpu->arch.cr2);
+
+	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+				   vmx->loaded_vmcs->launched);
+
+	vcpu->arch.cr2 = native_read_cr2();
+
+	/*
+	 * VMEXIT disables interrupts (host state), but tracing and lockdep
+	 * have them in state 'on' as recorded before entering guest mode.
+	 * Same as enter_from_user_mode().
+	 *
+	 * guest_exit_irqoff() restores host context and reinstates RCU if
+	 * enabled and required.
+	 *
+	 * This needs to be done before the below as native_read_msr()
+	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+	 * into world and some more.
+	 */
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	guest_exit_irqoff();
+
+	instrumentation_begin();
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
+
 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	fastpath_t exit_fastpath;
@@ -6724,19 +6811,8 @@ reenter_guest:
 	 */
 	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 
-	/* L1D Flush includes CPU buffer clear to mitigate MDS */
-	if (static_branch_unlikely(&vmx_l1d_should_flush))
-		vmx_l1d_flush(vcpu);
-	else if (static_branch_unlikely(&mds_user_clear))
-		mds_clear_cpu_buffers();
-
-	if (vcpu->arch.cr2 != read_cr2())
-		write_cr2(vcpu->arch.cr2);
-
-	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
-				   vmx->loaded_vmcs->launched);
-
-	vcpu->arch.cr2 = read_cr2();
+	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
+	vmx_vcpu_enter_exit(vcpu, vmx);
 
 	/*
 	 * We do not use IBRS in the kernel. If this vCPU has used the
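The hunks above move the entry/exit window into the noinstr vmx_vcpu_enter_exit() and switch to native_wrmsrl()/native_read_cr2(), because the ordinary wrappers can go through tracepoints or paravirt indirection that must not run between guest_enter_irqoff() and guest_exit_irqoff(). As a hedged illustration of what a raw, hook-free MSR write boils down to on x86-64 (this is not the kernel's native_wrmsrl() implementation, the raw_wrmsrl() name is made up, and WRMSR only executes at CPL0):

	/* Hedged sketch: a trace-free WRMSR, the kind of accessor safe in noinstr code. */
	#include <stdint.h>

	static inline void raw_wrmsrl(uint32_t msr, uint64_t val)
	{
		uint32_t lo = (uint32_t)val;		/* EDX:EAX carry the 64-bit value */
		uint32_t hi = (uint32_t)(val >> 32);

		__asm__ volatile("wrmsr" : : "c"(msr), "a"(lo), "d"(hi) : "memory");
	}

Keeping the whole window free of instrumentable calls is what allows objtool to verify that nothing between the context-tracking transitions can fault, trace, or recurse into RCU.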
@@ -7229,7 +7305,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
 		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
 }
 
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7478,42 +7554,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
 	kvm_flush_pml_buffers(kvm);
 }
 
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-	struct vmcs12 *vmcs12;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	gpa_t dst;
-
-	if (is_guest_mode(vcpu)) {
-		WARN_ON_ONCE(vmx->nested.pml_full);
-
-		/*
-		 * Check if PML is enabled for the nested guest.
-		 * Whether eptp bit 6 is set is already checked
-		 * as part of A/D emulation.
-		 */
-		vmcs12 = get_vmcs12(vcpu);
-		if (!nested_cpu_has_pml(vmcs12))
-			return 0;
-
-		if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
-			vmx->nested.pml_full = true;
-			return 1;
-		}
-
-		gpa &= ~0xFFFull;
-		dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
-		if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
-					 offset_in_page(dst), sizeof(gpa)))
-			return 0;
-
-		vmcs12->guest_pml_index--;
-	}
-
-	return 0;
-}
-
 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				gfn_t offset, unsigned long mask)
@@ -7858,7 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
-	.update_bp_intercept = update_exception_bitmap,
+	.update_exception_bitmap = update_exception_bitmap,
 	.get_msr_feature = vmx_get_msr_feature,
 	.get_msr = vmx_get_msr,
 	.set_msr = vmx_set_msr,
@@ -7918,12 +7958,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.set_identity_map_addr = vmx_set_identity_map_addr,
-	.get_tdp_level = vmx_get_tdp_level,
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
 
-	.cpuid_update = vmx_cpuid_update,
+	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
@@ -7942,7 +7981,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
-	.write_log_dirty = vmx_write_pml_buffer,
 
 	.pre_block = vmx_pre_block,
 	.post_block = vmx_post_block,
@@ -8070,7 +8108,7 @@ static __init int hardware_setup(void)
 		ept_lpage_level = PG_LEVEL_2M;
 	else
 		ept_lpage_level = PG_LEVEL_4K;
-	kvm_configure_mmu(enable_ept, ept_lpage_level);
+	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
 
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
@@ -8265,6 +8303,13 @@ static int __init vmx_init(void)
 #endif
 	vmx_check_vmcs12_offsets();
 
+	/*
+	 * Intel processors don't have problems with
+	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
+	 * it for VMX by default
+	 */
+	allow_smaller_maxphyaddr = true;
+
 	return 0;
 }
 module_init(vmx_init);
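The final hunk enables allow_smaller_maxphyaddr for VMX, which ties the series together: when the guest's CPUID-reported MAXPHYADDR is smaller than the host's, KVM must treat guest-physical addresses above the guest limit as reserved-bit faults even though the hardware would accept them. A minimal sketch of that check follows; it is equivalent in spirit to the kvm_mmu_is_illegal_gpa() call used in handle_ept_violation() above, but the helper name and parameter plumbing are illustrative, not the kernel's.

	/* Hedged sketch: does a GPA set bits the guest believes are reserved? */
	#include <stdbool.h>
	#include <stdint.h>

	static bool gpa_exceeds_guest_maxphyaddr(uint64_t gpa, unsigned int guest_maxphyaddr)
	{
		if (guest_maxphyaddr >= 64)
			return false;			/* no reserved address bits */
		return (gpa >> guest_maxphyaddr) != 0;	/* any high bit set means fault */
	}

When such a GPA is touched, the EPT-violation handler emulates the faulting instruction so that the guest observes the #PF it would have received on hardware with that MAXPHYADDR, rather than a silently successful access.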