author    Linus Torvalds <torvalds@linux-foundation.org>  2021-02-21 13:31:43 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2021-02-21 13:31:43 -0800
commit    3e10585335b7967326ca7b4118cada0d2d00a2ab (patch)
tree      e1655bc4f093f7de3a54dc3b2d83a54159aca10b
parent    9c5b80b795e9c847a7b7f5e63c6bcf07873fbcdf (diff)
parent    8c6e67bec3192f16fa624203c8131e10cc4814ba (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "x86:

   - Support for userspace to emulate Xen hypercalls

   - Raise the maximum number of user memslots

   - Scalability improvements for the new MMU.

     Instead of the complex "fast page fault" logic that is used in
     mmu.c, tdp_mmu.c uses an rwlock so that page faults are concurrent,
     but the code that can run against page faults is limited. Right now
     only page faults take the lock for reading; in the future this will
     be extended to some cases of page table destruction. I hope to
     switch the default MMU around 5.12-rc3 (some testing was delayed
     due to Chinese New Year).

   - Cleanups for MAXPHYADDR checks

   - Use static calls for vendor-specific callbacks

   - On AMD, use VMLOAD/VMSAVE to save and restore host state

   - Stop using deprecated jump label APIs

   - Workaround for AMD erratum that made nested virtualization
     unreliable

   - Support for LBR emulation in the guest

   - Support for communicating bus lock vmexits to userspace

   - Add support for SEV attestation command

   - Miscellaneous cleanups

  PPC:

   - Support for second data watchpoint on POWER10

   - Remove some complex workarounds for buggy early versions of POWER9

   - Guest entry/exit fixes

  ARM64:

   - Make the nVHE EL2 object relocatable

   - Cleanups for concurrent translation faults hitting the same page

   - Support for the standard TRNG hypervisor call

   - A bunch of small PMU/Debug fixes

   - Simplification of the early init hypercall handling

  Non-KVM changes (with acks):

   - Detection of contended rwlocks (implemented only for qrwlocks,
     because KVM only needs it for x86)

   - Allow __DISABLE_EXPORTS from assembly code

   - Provide a saner follow_pfn replacements for modules"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (192 commits)
  KVM: x86/xen: Explicitly pad struct compat_vcpu_info to 64 bytes
  KVM: selftests: Don't bother mapping GVA for Xen shinfo test
  KVM: selftests: Fix hex vs. decimal snafu in Xen test
  KVM: selftests: Fix size of memslots created by Xen tests
  KVM: selftests: Ignore recently added Xen tests' build output
  KVM: selftests: Add missing header file needed by xAPIC IPI tests
  KVM: selftests: Add operand to vmsave/vmload/vmrun in svm.c
  KVM: SVM: Make symbol 'svm_gp_erratum_intercept' static
  locking/arch: Move qrwlock.h include after qspinlock.h
  KVM: PPC: Book3S HV: Fix host radix SLB optimisation with hash guests
  KVM: PPC: Book3S HV: Ensure radix guest has no SLB entries
  KVM: PPC: Don't always report hash MMU capability for P9 < DD2.2
  KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path
  KVM: PPC: remove unneeded semicolon
  KVM: PPC: Book3S HV: Use POWER9 SLBIA IH=6 variant to clear SLB
  KVM: PPC: Book3S HV: No need to clear radix host SLB before loading HPT guest
  KVM: PPC: Book3S HV: Fix radix guest SLB side channel
  KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
  KVM: PPC: Book3S HV: Introduce new capability for 2nd DAWR
  KVM: PPC: Book3S HV: Add infrastructure to support 2nd DAWR
  ...
-rw-r--r--  Documentation/virt/kvm/amd-memory-encryption.rst | 21
-rw-r--r--  Documentation/virt/kvm/api.rst | 228
-rw-r--r--  Documentation/virt/kvm/locking.rst | 9
-rw-r--r--  arch/arm64/include/asm/hyp_image.h | 29
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 20
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 61
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h | 5
-rw-r--r--  arch/arm64/include/asm/sections.h | 3
-rw-r--r--  arch/arm64/include/asm/spinlock.h | 2
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 3
-rw-r--r--  arch/arm64/kernel/image-vars.h | 1
-rw-r--r--  arch/arm64/kernel/smp.c | 4
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 18
-rw-r--r--  arch/arm64/kvm/Makefile | 2
-rw-r--r--  arch/arm64/kvm/arm.c | 7
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/.gitignore | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/Makefile | 33
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/gen-hyprel.c | 438
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/host.S | 29
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-init.S | 19
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 9
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/psci-relay.c | 24
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 83
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 2
-rw-r--r--  arch/arm64/kvm/hypercalls.c | 6
-rw-r--r--  arch/arm64/kvm/mmu.c | 13
-rw-r--r--  arch/arm64/kvm/pmu-emul.c | 14
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 85
-rw-r--r--  arch/arm64/kvm/trng.c | 85
-rw-r--r--  arch/arm64/kvm/va_layout.c | 34
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 1
-rw-r--r--  arch/mips/include/asm/spinlock.h | 2
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 25
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 11
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 8
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 2
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 9
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 149
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 108
-rw-r--r--  arch/powerpc/kvm/book3s_hv_nested.c | 70
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 175
-rw-r--r--  arch/powerpc/kvm/booke.c | 2
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 14
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 1
-rw-r--r--  arch/s390/pci/pci_mmio.c | 4
-rw-r--r--  arch/sparc/include/asm/spinlock_64.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 2
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 127
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 89
-rw-r--r--  arch/x86/include/asm/virtext.h | 25
-rw-r--r--  arch/x86/include/asm/vmx.h | 1
-rw-r--r--  arch/x86/include/asm/vmxfeatures.h | 1
-rw-r--r--  arch/x86/include/asm/xen/interface.h | 3
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 1
-rw-r--r--  arch/x86/kernel/reboot.c | 30
-rw-r--r--  arch/x86/kvm/Makefile | 5
-rw-r--r--  arch/x86/kvm/cpuid.c | 24
-rw-r--r--  arch/x86/kvm/cpuid.h | 24
-rw-r--r--  arch/x86/kvm/emulate.c | 14
-rw-r--r--  arch/x86/kvm/hyperv.c | 343
-rw-r--r--  arch/x86/kvm/hyperv.h | 54
-rw-r--r--  arch/x86/kvm/irq.c | 10
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 10
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 60
-rw-r--r--  arch/x86/kvm/lapic.h | 20
-rw-r--r--  arch/x86/kvm/mmu.h | 8
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 353
-rw-r--r--  arch/x86/kvm/mmu/mmu_audit.c | 8
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 7
-rw-r--r--  arch/x86/kvm/mmu/page_track.c | 8
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 8
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 2
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 33
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c | 46
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.h | 21
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 554
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 32
-rw-r--r--  arch/x86/kvm/mtrr.c | 12
-rw-r--r--  arch/x86/kvm/pmu.c | 10
-rw-r--r--  arch/x86/kvm/pmu.h | 2
-rw-r--r--  arch/x86/kvm/svm/avic.c | 35
-rw-r--r--  arch/x86/kvm/svm/nested.c | 8
-rw-r--r--  arch/x86/kvm/svm/sev.c | 104
-rw-r--r--  arch/x86/kvm/svm/svm.c | 303
-rw-r--r--  arch/x86/kvm/svm/svm.h | 29
-rw-r--r--  arch/x86/kvm/svm/svm_ops.h | 69
-rw-r--r--  arch/x86/kvm/trace.h | 40
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 28
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 106
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 294
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 6
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 2
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 282
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 56
-rw-r--r--  arch/x86/kvm/x86.c | 718
-rw-r--r--  arch/x86/kvm/x86.h | 12
-rw-r--r--  arch/x86/kvm/xen.c | 431
-rw-r--r--  arch/x86/kvm/xen.h | 78
-rw-r--r--  arch/xtensa/include/asm/spinlock.h | 2
-rw-r--r--  drivers/crypto/ccp/sev-dev.c | 1
-rw-r--r--  drivers/gpu/drm/i915/gvt/kvmgt.c | 12
-rw-r--r--  fs/dax.c | 5
-rw-r--r--  include/asm-generic/export.h | 2
-rw-r--r--  include/asm-generic/qrwlock.h | 25
-rw-r--r--  include/linux/kvm_host.h | 10
-rw-r--r--  include/linux/mm.h | 6
-rw-r--r--  include/linux/psp-sev.h | 17
-rw-r--r--  include/linux/rwlock.h | 7
-rw-r--r--  include/linux/sched.h | 29
-rw-r--r--  include/uapi/linux/kvm.h | 74
-rw-r--r--  include/xen/interface/xen.h | 4
-rw-r--r--  kernel/locking/qrwlock.c | 1
-rw-r--r--  kernel/sched/core.c | 40
-rw-r--r--  mm/memory.c | 41
-rw-r--r--  tools/arch/powerpc/include/uapi/asm/kvm.h | 2
-rw-r--r--  tools/include/uapi/linux/kvm.h | 1
-rw-r--r--  tools/testing/selftests/kvm/.gitignore | 6
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 6
-rw-r--r--  tools/testing/selftests/kvm/demand_paging_test.c | 43
-rw-r--r--  tools/testing/selftests/kvm/dirty_log_perf_test.c | 92
-rw-r--r--  tools/testing/selftests/kvm/include/kvm_util.h | 6
-rw-r--r--  tools/testing/selftests/kvm/include/numaif.h | 55
-rw-r--r--  tools/testing/selftests/kvm/include/perf_test_util.h | 7
-rw-r--r--  tools/testing/selftests/kvm/include/test_util.h | 16
-rw-r--r--  tools/testing/selftests/kvm/include/x86_64/processor.h | 41
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util.c | 1
-rw-r--r--  tools/testing/selftests/kvm/lib/perf_test_util.c | 31
-rw-r--r--  tools/testing/selftests/kvm/lib/test_util.c | 31
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/processor.c | 144
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/svm.c | 8
-rw-r--r--  tools/testing/selftests/kvm/memslot_modification_stress_test.c | 212
-rw-r--r--  tools/testing/selftests/kvm/settings | 1
-rw-r--r--  tools/testing/selftests/kvm/x86_64/evmcs_test.c | 3
-rw-r--r--  tools/testing/selftests/kvm/x86_64/get_cpuid_test.c | 175
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 31
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c | 131
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c | 544
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 167
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c | 149
-rw-r--r--  virt/kvm/dirty_ring.c | 8
-rw-r--r--  virt/kvm/kvm_main.c | 54
-rw-r--r--  virt/kvm/mmu_lock.h | 23
150 files changed, 6655 insertions, 2060 deletions
diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 09a8f2a34e39..469a6308765b 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -263,6 +263,27 @@ Returns: 0 on success, -negative on error
__u32 trans_len;
};
+10. KVM_SEV_GET_ATTESTATION_REPORT
+----------------------------------
+
+The KVM_SEV_GET_ATTESTATION_REPORT command can be used by the hypervisor to query the attestation
+report containing the SHA-256 digest of the guest memory and VMSA passed through the KVM_SEV_LAUNCH
+commands and signed with the PEK. The digest returned by the command should match the digest
+obtained by the guest owner with KVM_SEV_LAUNCH_MEASURE.
+
+Parameters (in): struct kvm_sev_attestation
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_attestation_report {
+ __u8 mnonce[16]; /* A random mnonce that will be placed in the report */
+
+ __u64 uaddr; /* userspace address where the report should be copied */
+ __u32 len;
+ };
+
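For reference, a hedged userspace sketch of driving this command through the
KVM_MEMORY_ENCRYPT_OP vm ioctl; the helper name, buffer handling and error
handling are illustrative assumptions, not part of the patch::

  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hedged sketch: fetch the signed attestation report for an SEV guest. */
  static int get_attestation_report(int vm_fd, int sev_fd,
                                    const uint8_t mnonce[16],
                                    void *buf, uint32_t buf_len)
  {
          struct kvm_sev_attestation_report report = {
                  .uaddr = (unsigned long)buf,
                  .len   = buf_len,
          };
          struct kvm_sev_cmd cmd = {
                  .id     = KVM_SEV_GET_ATTESTATION_REPORT,
                  .data   = (unsigned long)&report,
                  .sev_fd = sev_fd,
          };

          /* The caller-chosen mnonce is echoed back inside the signed report. */
          memcpy(report.mnonce, mnonce, sizeof(report.mnonce));

          /* 0 on success; cmd.error then carries the SEV firmware status. */
          return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
  }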
References
==========
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 99ceb978c8b0..45fd862ac128 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -960,6 +960,14 @@ memory.
__u8 pad2[30];
};
+If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
+KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
+This requests KVM to generate the contents of the hypercall page
+automatically; hypercalls will be intercepted and passed to userspace
+through KVM_EXIT_XEN. In this case, all of the blob size and address
+fields must be zero.
+
+No other flags are currently valid in the struct kvm_xen_hvm_config.
4.29 KVM_GET_CLOCK
------------------
@@ -2268,6 +2276,8 @@ registers, find a list below:
PPC KVM_REG_PPC_PSSCR 64
PPC KVM_REG_PPC_DEC_EXPIRY 64
PPC KVM_REG_PPC_PTCR 64
+ PPC KVM_REG_PPC_DAWR1 64
+ PPC KVM_REG_PPC_DAWRX1 64
PPC KVM_REG_PPC_TM_GPR0 64
...
PPC KVM_REG_PPC_TM_GPR31 64
@@ -4831,6 +4841,101 @@ into user space.
If a vCPU is in running state while this ioctl is invoked, the vCPU may
experience inconsistent filtering behavior on MSR accesses.
+4.127 KVM_XEN_HVM_SET_ATTR
+--------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_xen_hvm_attr
+:Returns: 0 on success, < 0 on error
+
+::
+
+ struct kvm_xen_hvm_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u8 long_mode;
+ __u8 vector;
+ struct {
+ __u64 gfn;
+ } shared_info;
+ __u64 pad[4];
+ } u;
+ };
+
+type values:
+
+KVM_XEN_ATTR_TYPE_LONG_MODE
+ Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
+ determines the layout of the shared info pages exposed to the VM.
+
+KVM_XEN_ATTR_TYPE_SHARED_INFO
+ Sets the guest physical frame number at which the Xen "shared info"
+ page resides. Note that although Xen places vcpu_info for the first
+ 32 vCPUs in the shared_info page, KVM does not automatically do so
+ and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
+ explicitly even when the vcpu_info for a given vCPU resides at the
+ "default" location in the shared_info page. This is because KVM is
+ not aware of the Xen CPU id which is used as the index into the
+ vcpu_info[] array, so cannot know the correct default location.
+
+KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
+ Sets the exception vector used to deliver Xen event channel upcalls.
+
+4.128 KVM_XEN_HVM_GET_ATTR
+--------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_xen_hvm_attr
+:Returns: 0 on success, < 0 on error
+
+Allows Xen VM attributes to be read. For the structure and types,
+see KVM_XEN_HVM_SET_ATTR above.
+
+4.129 KVM_XEN_VCPU_SET_ATTR
+---------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xen_vcpu_attr
+:Returns: 0 on success, < 0 on error
+
+::
+
+ struct kvm_xen_vcpu_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u64 gpa;
+ __u64 pad[4];
+ } u;
+ };
+
+type values:
+
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO
+ Sets the guest physical address of the vcpu_info for a given vCPU.
+
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
+ Sets the guest physical address of an additional pvclock structure
+ for a given vCPU. This is typically used for guest vsyscall support.
+
+4.130 KVM_XEN_VCPU_GET_ATTR
+---------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xen_vcpu_attr
+:Returns: 0 on success, < 0 on error
+
+Allows Xen vCPU attributes to be read. For the structure and types,
+see KVM_XEN_VCPU_SET_ATTR above.
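A minimal, hedged userspace sketch of the attribute ioctls documented above;
the helper name and the gfn/gpa arguments are placeholders::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hedged sketch: select the 64-bit ABI, point KVM at the shared_info
   * page, and register one vCPU's vcpu_info explicitly. */
  static int setup_xen_shared_info(int vm_fd, int vcpu_fd,
                                   __u64 shinfo_gfn, __u64 vcpu_info_gpa)
  {
          struct kvm_xen_hvm_attr lm = {
                  .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
                  .u.long_mode = 1,
          };
          struct kvm_xen_hvm_attr shinfo = {
                  .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
                  .u.shared_info.gfn = shinfo_gfn,
          };
          struct kvm_xen_vcpu_attr vi = {
                  .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
                  .u.gpa = vcpu_info_gpa,
          };

          if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &lm) ||
              ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &shinfo))
                  return -1;

          /* Per-vCPU: KVM cannot infer the default vcpu_info location. */
          return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &vi);
  }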
5. The kvm_run structure
========================
@@ -4893,9 +4998,11 @@ local APIC is not used.
__u16 flags;
More architecture-specific flags detailing state of the VCPU that may
-affect the device's behavior. The only currently defined flag is
-KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the
-VCPU is in system management mode.
+affect the device's behavior. Currently defined flags::
+ /* x86, set if the VCPU is in system management mode */
+ #define KVM_RUN_X86_SMM (1 << 0)
+ /* x86, set if bus lock detected in VM */
+ #define KVM_RUN_BUS_LOCK (1 << 1)
::
@@ -4996,13 +5103,18 @@ to the byte array.
.. note::
- For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
+ For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
operations are complete (and guest state is consistent) only after userspace
has re-entered the kernel with KVM_RUN. The kernel side will first finish
- incomplete operations and then check for pending signals. Userspace
- can re-enter the guest with an unmasked signal pending to complete
- pending operations.
+ incomplete operations and then check for pending signals.
+
+ The pending state of the operation is not preserved in state which is
+ visible to userspace, thus userspace should ensure that the operation is
+ completed before performing a live migration. Userspace can re-enter the
+ guest with an unmasked signal pending or with the immediate_exit field set
+ to complete pending operations without allowing any further instructions
+ to be executed.
::
@@ -5329,6 +5441,34 @@ vCPU execution. If the MSR write was unsuccessful, user space also sets the
::
+
+ struct kvm_xen_exit {
+ #define KVM_EXIT_XEN_HCALL 1
+ __u32 type;
+ union {
+ struct {
+ __u32 longmode;
+ __u32 cpl;
+ __u64 input;
+ __u64 result;
+ __u64 params[6];
+ } hcall;
+ } u;
+ };
+ /* KVM_EXIT_XEN */
+ struct kvm_xen_exit xen;
+
+Indicates that the VCPU exits into userspace to process some tasks
+related to Xen emulation.
+
+Valid values for 'type' are:
+
+ - KVM_EXIT_XEN_HCALL -- synchronously notify user-space about Xen hypercall.
+ Userspace is expected to place the hypercall result into the appropriate
+ field before invoking KVM_RUN again.
+
+::
+
/* Fix the size of the union. */
char padding[256];
};
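A hedged sketch of how a VMM run loop might service this exit;
handle_xen_hypercall() is a hypothetical dispatcher, not part of the API
described above::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hypothetical userspace hypercall emulator. */
  extern __u64 handle_xen_hypercall(__u64 input, const __u64 *params);

  static void run_vcpu(int vcpu_fd, struct kvm_run *run)
  {
          for (;;) {
                  if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                          break;

                  if (run->exit_reason == KVM_EXIT_XEN &&
                      run->xen.type == KVM_EXIT_XEN_HCALL) {
                          /* Emulate the call, then publish the result in the
                           * exit structure before re-entering with KVM_RUN. */
                          run->xen.u.hcall.result =
                                  handle_xen_hypercall(run->xen.u.hcall.input,
                                                       run->xen.u.hcall.params);
                          continue;
                  }

                  break;  /* other exit reasons elided */
          }
  }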
@@ -6038,6 +6178,53 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
can then handle to implement model specific MSR handling and/or user notifications
to inform a user that an MSR was not handled.
+7.22 KVM_CAP_X86_BUS_LOCK_EXIT
+-------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] defines the policy used when bus locks detected in guest
+:Returns: 0 on success, -EINVAL when args[0] contains invalid bits
+
+Valid bits in args[0] are::
+
+ #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0)
+ #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
+
+Enabling this capability on a VM provides userspace with a way to select a
+policy for handling the bus locks detected in the guest. Userspace can obtain
+the supported modes from the result of KVM_CHECK_EXTENSION and select one of
+them via KVM_ENABLE_CAP.
+
+KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are currently
+supported and are mutually exclusive with each other. More bits can be added
+in the future.
+
+With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in the guest do not cause VM
+exits, so no additional action is needed. This is the default mode.
+
+With KVM_BUS_LOCK_DETECTION_EXIT set, a VM exit occurs whenever a bus lock is
+detected in the guest. KVM simply exits to userspace when handling such exits.
+Userspace can enforce its own throttling or other policy-based mitigations.
+
+This capability is aimed at addressing the threat that a guest can exploit bus
+locks to degrade the performance of the whole system. Once userspace enables
+this capability and selects the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM sets the
+KVM_RUN_BUS_LOCK flag in the vcpu->run->flags field and exits to userspace.
+Because the bus lock VM exit can be preempted by a higher-priority VM exit, the
+exit notification delivered to userspace can be KVM_EXIT_BUS_LOCK or another
+reason; the KVM_RUN_BUS_LOCK flag is used to distinguish between them.
+
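A hedged sketch of enabling the capability and spotting the flag from
userspace; the enable_bus_lock_exits()/maybe_throttle() helpers and the
policy itself are placeholders::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int enable_bus_lock_exits(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_X86_BUS_LOCK_EXIT,
                  .args = { KVM_BUS_LOCK_DETECTION_EXIT },
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }

  static void maybe_throttle(struct kvm_run *run)
  {
          /* The flag can accompany any exit reason, since the bus lock
           * exit may be preempted by a higher-priority VM exit. */
          if (run->flags & KVM_RUN_BUS_LOCK) {
                  /* apply rate limiting / accounting here */
          }
  }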
+7.23 KVM_CAP_PPC_DAWR1
+----------------------
+
+:Architectures: ppc
+:Parameters: none
+:Returns: 0 on success, -EINVAL when CPU doesn't support 2nd DAWR
+
+This capability can be used to check/enable the 2nd DAWR feature provided
+by the POWER10 processor.
+
8. Other capabilities.
======================
@@ -6415,7 +6602,6 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
(0x40000001). Otherwise, a guest may use the paravirtual features
regardless of what has actually been exposed through the CPUID leaf.
-
8.29 KVM_CAP_DIRTY_LOG_RING
---------------------------
@@ -6502,3 +6688,29 @@ KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling
KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
machine will switch to ring-buffer dirty page tracking and further
KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
+
+8.30 KVM_CAP_XEN_HVM
+--------------------
+
+:Architectures: x86
+
+This capability indicates the features that Xen supports for hosting Xen
+PVHVM guests. Valid flags are::
+
+ #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
+ #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
+ #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
+
+The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
+ioctl is available, for the guest to set its hypercall page.
+
+If KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL is also set, the same flag may also be
+provided in the flags to KVM_XEN_HVM_CONFIG, without providing hypercall page
+contents, to request that KVM generate hypercall page content automatically
+and also enable interception of guest hypercalls with KVM_EXIT_XEN.
+
+The KVM_XEN_HVM_CONFIG_SHARED_INFO flag indicates the availability of the
+KVM_XEN_HVM_SET_ATTR, KVM_XEN_HVM_GET_ATTR, KVM_XEN_VCPU_SET_ATTR and
+KVM_XEN_VCPU_GET_ATTR ioctls, as well as the delivery of exception vectors
+for event channel upcalls when the evtchn_upcall_pending field of a vcpu's
+vcpu_info is set.
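A hedged userspace sketch tying the capability check to KVM_XEN_HVM_CONFIG;
the helper name is illustrative and the choice of hypercall-page MSR is
elided::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Hedged sketch: if hypercall interception is available, ask KVM to
   * fill the hypercall page itself and forward hypercalls as KVM_EXIT_XEN. */
  static int enable_xen_hcall_intercept(int kvm_fd, int vm_fd)
  {
          struct kvm_xen_hvm_config cfg = { 0 };
          int feat = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

          if (feat < 0 || !(feat & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL))
                  return -1;

          /* Blob size and address fields stay zero in this mode; cfg.msr
           * (which MSR the guest writes to set up the page) depends on the
           * VMM's Xen CPUID layout and is not shown here. */
          cfg.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;

          return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
  }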
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index b21a34c34a21..0aa4817b466d 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows:
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
them together is quite rare.
-On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
+On x86:
+
+- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
+ taken inside kvm->arch.mmu_lock, and cannot be taken without already
+ holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
+ there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
Everything else is a leaf: no other lock is taken inside the critical
sections.
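A hedged, illustrative sketch of that ordering (not the in-tree MMU code; it
only shows which lock nests inside which)::

  #include <linux/kvm_host.h>

  /* Illustrative only: mmu_lock (an rwlock on x86, here taken for read)
   * is always taken strictly outside tdp_mmu_pages_lock (a spinlock). */
  static void walk_tdp_mmu_pages(struct kvm *kvm)
  {
          read_lock(&kvm->mmu_lock);
          spin_lock(&kvm->arch.tdp_mmu_pages_lock);

          /* ... manipulate the TDP MMU page lists ... */

          spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
          read_unlock(&kvm->mmu_lock);
  }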
diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
index daa1a1da539e..737ded6b6d0d 100644
--- a/arch/arm64/include/asm/hyp_image.h
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -7,6 +7,9 @@
#ifndef __ARM64_HYP_IMAGE_H__
#define __ARM64_HYP_IMAGE_H__
+#define __HYP_CONCAT(a, b) a ## b
+#define HYP_CONCAT(a, b) __HYP_CONCAT(a, b)
+
/*
* KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
* to separate it from the kernel proper.
@@ -21,9 +24,31 @@
*/
#define HYP_SECTION_NAME(NAME) .hyp##NAME
+/* Symbol defined at the beginning of each hyp section. */
+#define HYP_SECTION_SYMBOL_NAME(NAME) \
+ HYP_CONCAT(__hyp_section_, HYP_SECTION_NAME(NAME))
+
+/*
+ * Helper to generate linker script statements starting a hyp section.
+ *
+ * A symbol with a well-known name is defined at the first byte. This
+ * is used as a base for hyp relocations (see gen-hyprel.c). It must
+ * be defined inside the section so the linker of `vmlinux` cannot
+ * separate it from the section data.
+ */
+#define BEGIN_HYP_SECTION(NAME) \
+ HYP_SECTION_NAME(NAME) : { \
+ HYP_SECTION_SYMBOL_NAME(NAME) = .;
+
+/* Helper to generate linker script statements ending a hyp section. */
+#define END_HYP_SECTION \
+ }
+
/* Defines an ELF hyp section from input section @NAME and its subsections. */
-#define HYP_SECTION(NAME) \
- HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
+#define HYP_SECTION(NAME) \
+ BEGIN_HYP_SECTION(NAME) \
+ *(NAME NAME##.*) \
+ END_HYP_SECTION
/*
* Defines a linker script alias of a kernel-proper symbol referenced by
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 7ccf770c53d9..22d933e9b59e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -199,26 +199,6 @@ extern void __vgic_v3_init_lrs(void);
extern u32 __kvm_get_mdcr_el2(void);
-/*
- * Obtain the PC-relative address of a kernel symbol
- * s: symbol
- *
- * The goal of this macro is to return a symbol's address based on a
- * PC-relative computation, as opposed to a loading the VA from a
- * constant pool or something similar. This works well for HYP, as an
- * absolute VA is guaranteed to be wrong. Only use this if trying to
- * obtain the address of a symbol (i.e. not something you obtained by
- * following a pointer).
- */
-#define hyp_symbol_addr(s) \
- ({ \
- typeof(s) *addr; \
- asm("adrp %0, %1\n" \
- "add %0, %0, :lo12:%1\n" \
- : "=r" (addr) : "S" (&s)); \
- addr; \
- })
-
#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
" .align 3\n" \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8fcfab0c2567..3d10e6527f7d 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -30,7 +30,6 @@
#define __KVM_HAVE_ARCH_INTC_INITIALIZED
-#define KVM_USER_MEM_SLOTS 512
#define KVM_HALT_POLL_NS_DEFAULT 500000
#include <kvm/arm_vgic.h>
@@ -771,4 +770,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
#define kvm_vcpu_has_pmu(vcpu) \
(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
+int kvm_trng_call(struct kvm_vcpu *vcpu);
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e52d82aeadca..90873851f677 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -73,49 +73,39 @@ alternative_cb_end
.endm
/*
- * Convert a kernel image address to a PA
- * reg: kernel address to be converted in place
+ * Convert a hypervisor VA to a PA
+ * reg: hypervisor address to be converted in place
* tmp: temporary register
- *
- * The actual code generation takes place in kvm_get_kimage_voffset, and
- * the instructions below are only there to reserve the space and
- * perform the register allocation (kvm_get_kimage_voffset uses the
- * specific registers encoded in the instructions).
*/
-.macro kimg_pa reg, tmp
-alternative_cb kvm_get_kimage_voffset
- movz \tmp, #0
- movk \tmp, #0, lsl #16
- movk \tmp, #0, lsl #32
- movk \tmp, #0, lsl #48
-alternative_cb_end
-
- /* reg = __pa(reg) */
- sub \reg, \reg, \tmp
+.macro hyp_pa reg, tmp
+ ldr_l \tmp, hyp_physvirt_offset
+ add \reg, \reg, \tmp
.endm
/*
- * Convert a kernel image address to a hyp VA
- * reg: kernel address to be converted in place
+ * Convert a hypervisor VA to a kernel image address
+ * reg: hypervisor address to be converted in place
* tmp: temporary register
*
* The actual code generation takes place in kvm_get_kimage_voffset, and
* the instructions below are only there to reserve the space and
- * perform the register allocation (kvm_update_kimg_phys_offset uses the
+ * perform the register allocation (kvm_get_kimage_voffset uses the
* specific registers encoded in the instructions).
*/
-.macro kimg_hyp_va reg, tmp
-alternative_cb kvm_update_kimg_phys_offset
+.macro hyp_kimg_va reg, tmp
+ /* Convert hyp VA -> PA. */
+ hyp_pa \reg, \tmp
+
+ /* Load kimage_voffset. */
+alternative_cb kvm_get_kimage_voffset
movz \tmp, #0
movk \tmp, #0, lsl #16
movk \tmp, #0, lsl #32
movk \tmp, #0, lsl #48
alternative_cb_end
- sub \reg, \reg, \tmp
- mov_q \tmp, PAGE_OFFSET
- orr \reg, \reg, \tmp
- kern_hyp_va \reg
+ /* Convert PA -> kimg VA. */
+ add \reg, \reg, \tmp
.endm
#else
@@ -129,6 +119,7 @@ alternative_cb_end
void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void kvm_compute_layout(void);
+void kvm_apply_hyp_relocations(void);
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
@@ -144,24 +135,6 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v))))
-static __always_inline unsigned long __kimg_hyp_va(unsigned long v)
-{
- unsigned long offset;
-
- asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
- "movk %0, #0, lsl #16\n"
- "movk %0, #0, lsl #32\n"
- "movk %0, #0, lsl #48\n",
- kvm_update_kimg_phys_offset)
- : "=r" (offset));
-
- return __kern_hyp_va((v - offset) | PAGE_OFFSET);
-}
-
-#define kimg_fn_hyp_va(v) ((typeof(*v))(__kimg_hyp_va((unsigned long)(v))))
-
-#define kimg_fn_ptr(x) (typeof(x) **)(x)
-
/*
* We currently support using a VM-specified IPA size. For backward
* compatibility, the default IPA size is fixed to 40bits.
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 52ab38db04c7..8886d43cfb11 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -157,6 +157,11 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
* If device attributes are not explicitly requested in @prot, then the
* mapping will be normal, cacheable.
*
+ * Note that the update of a valid leaf PTE in this function will be aborted,
+ * if it's trying to recreate the exact same mapping or only change the access
+ * permissions. Instead, the vCPU will exit one more time from guest if still
+ * needed and then go through the path of relaxing permissions.
+ *
* Note that this function will both coalesce existing table entries and split
* existing block mappings, relying on page-faults to fault back areas outside
* of the new mapping lazily.
diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 8ff579361731..2f36b16a5b5d 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -11,7 +11,8 @@ extern char __alt_instructions[], __alt_instructions_end[];
extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
extern char __hyp_text_start[], __hyp_text_end[];
-extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[];
+extern char __hyp_rodata_start[], __hyp_rodata_end[];
+extern char __hyp_reloc_begin[], __hyp_reloc_end[];
extern char __idmap_text_start[], __idmap_text_end[];
extern char __initdata_begin[], __initdata_end[];
extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index 9083d6992603..0525c0b089ed 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -5,8 +5,8 @@
#ifndef __ASM_SPINLOCK_H
#define __ASM_SPINLOCK_H
-#include <asm/qrwlock.h>
#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
/* See include/linux/spinlock.h */
#define smp_mb__after_spinlock() smp_mb()
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 767bb2d47be9..f9fbbb4734e8 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -853,7 +853,10 @@
#define ID_DFR0_PERFMON_SHIFT 24
+#define ID_DFR0_PERFMON_8_0 0x3
#define ID_DFR0_PERFMON_8_1 0x4
+#define ID_DFR0_PERFMON_8_4 0x5
+#define ID_DFR0_PERFMON_8_5 0x6
#define ID_ISAR4_SWP_FRAC_SHIFT 28
#define ID_ISAR4_PSR_M_SHIFT 24
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index f676243abac6..23f1a557bd9f 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -64,7 +64,6 @@ __efistub__ctype = _ctype;
/* Alternative callbacks for init-time patching of nVHE hyp code. */
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
-KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset);
KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
/* Global kernel state accessed by nVHE hyp code. */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index ad00f99ee9b0..357590beaabb 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -434,8 +434,10 @@ static void __init hyp_mode_check(void)
"CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
- if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode())
+ if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) {
kvm_compute_layout();
+ kvm_apply_hyp_relocations();
+ }
}
void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 68f76a96c60b..7eea7888bb02 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -31,10 +31,11 @@ jiffies = jiffies_64;
__stop___kvm_ex_table = .;
#define HYPERVISOR_DATA_SECTIONS \
- HYP_SECTION_NAME(.data..ro_after_init) : { \
- __hyp_data_ro_after_init_start = .; \
+ HYP_SECTION_NAME(.rodata) : { \
+ __hyp_rodata_start = .; \
*(HYP_SECTION_NAME(.data..ro_after_init)) \
- __hyp_data_ro_after_init_end = .; \
+ *(HYP_SECTION_NAME(.rodata)) \
+ __hyp_rodata_end = .; \
}
#define HYPERVISOR_PERCPU_SECTION \
@@ -42,10 +43,19 @@ jiffies = jiffies_64;
HYP_SECTION_NAME(.data..percpu) : { \
*(HYP_SECTION_NAME(.data..percpu)) \
}
+
+#define HYPERVISOR_RELOC_SECTION \
+ .hyp.reloc : ALIGN(4) { \
+ __hyp_reloc_begin = .; \
+ *(.hyp.reloc) \
+ __hyp_reloc_end = .; \
+ }
+
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_DATA_SECTIONS
#define HYPERVISOR_PERCPU_SECTION
+#define HYPERVISOR_RELOC_SECTION
#endif
#define HYPERVISOR_TEXT \
@@ -216,6 +226,8 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
HYPERVISOR_PERCPU_SECTION
+ HYPERVISOR_RELOC_SECTION
+
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
}
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 13b017284bf9..589921392cb1 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -16,7 +16,7 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
- arch_timer.o \
+ arch_timer.o trng.o\
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 6804c5673312..fc4c95dd2d26 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1750,11 +1750,10 @@ static int init_hyp_mode(void)
goto out_err;
}
- err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start),
- kvm_ksym_ref(__hyp_data_ro_after_init_end),
- PAGE_HYP_RO);
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
+ kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {
- kvm_err("Cannot map .hyp.data..ro_after_init section\n");
+ kvm_err("Cannot map .hyp.rodata section\n");
goto out_err;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 84473574c2e7..54f4860cd87c 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -505,8 +505,8 @@ static inline void __kvm_unexpected_el2_exception(void)
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
- entry = hyp_symbol_addr(__start___kvm_ex_table);
- end = hyp_symbol_addr(__stop___kvm_ex_table);
+ entry = &__start___kvm_ex_table;
+ end = &__stop___kvm_ex_table;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
index 695d73d0249e..5b6c43cc96f8 100644
--- a/arch/arm64/kvm/hyp/nvhe/.gitignore
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -1,2 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
+gen-hyprel
hyp.lds
+hyp-reloc.S
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 1f1e351c5fe2..a6707df4f6c0 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -3,8 +3,11 @@
# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part
#
-asflags-y := -D__KVM_NVHE_HYPERVISOR__
-ccflags-y := -D__KVM_NVHE_HYPERVISOR__
+asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
+ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
+
+hostprogs := gen-hyprel
+HOST_EXTRACFLAGS += -I$(objtree)/include
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o
@@ -19,7 +22,7 @@ obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
obj-y := kvm_nvhe.o
-extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o
# 1) Compile all source files to `.nvhe.o` object files. The file extension
# avoids file name clashes for files shared with VHE.
@@ -42,11 +45,31 @@ LDFLAGS_kvm_nvhe.tmp.o := -r -T
$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
$(call if_changed,ld)
-# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# 4) Generate list of hyp code/data positions that need to be relocated at
+# runtime. Because the hypervisor is part of the kernel binary, relocations
+# produce a kernel VA. We enumerate relocations targeting hyp at build time
+# and convert the kernel VAs at those positions to hyp VAs.
+$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel
+ $(call if_changed,hyprel)
+
+# 5) Compile hyp-reloc.S and link it into the existing partially linked object.
+# The object file now contains a section with pointers to hyp positions that
+# will contain kernel VAs at runtime. These pointers have relocations on them
+# so that they get updated as the hyp object is linked into `vmlinux`.
+LDFLAGS_kvm_nvhe.rel.o := -r
+$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE
+ $(call if_changed,ld)
+
+# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
# Prefixes names of ELF symbols with '__kvm_nvhe_'.
-$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
$(call if_changed,hypcopy)
+# The HYPREL command calls `gen-hyprel` to generate an assembly file with
+# a list of relocations targeting hyp code/data.
+quiet_cmd_hyprel = HYPREL $@
+ cmd_hyprel = $(obj)/gen-hyprel $< > $@
+
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
new file mode 100644
index 000000000000..ead02c6a7628
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: David Brazdil <dbrazdil@google.com>
+ *
+ * Generates relocation information used by the kernel to convert
+ * absolute addresses in hyp data from kernel VAs to hyp VAs.
+ *
+ * This is necessary because hyp code is linked into the same binary
+ * as the kernel but executes under different memory mappings.
+ * If the compiler used absolute addressing, those addresses need to
+ * be converted before they are used by hyp code.
+ *
+ * The input of this program is the relocatable ELF object containing
+ * all hyp code/data, not yet linked into vmlinux. Hyp section names
+ * should have been prefixed with `.hyp` at this point.
+ *
+ * The output (printed to stdout) is an assembly file containing
+ * an array of 32-bit integers and static relocations that instruct
+ * the linker of `vmlinux` to populate the array entries with offsets
+ * to positions in the kernel binary containing VAs used by hyp code.
+ *
+ * Note that dynamic relocations could be used for the same purpose.
+ * However, those are only generated if CONFIG_RELOCATABLE=y.
+ */
+
+#include <elf.h>
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <generated/autoconf.h>
+
+#define HYP_SECTION_PREFIX ".hyp"
+#define HYP_RELOC_SECTION ".hyp.reloc"
+#define HYP_SECTION_SYMBOL_PREFIX "__hyp_section_"
+
+/*
+ * AArch64 relocation type constants.
+ * Included in case these are not defined in the host toolchain.
+ */
+#ifndef R_AARCH64_ABS64
+#define R_AARCH64_ABS64 257
+#endif
+#ifndef R_AARCH64_LD_PREL_LO19
+#define R_AARCH64_LD_PREL_LO19 273
+#endif
+#ifndef R_AARCH64_ADR_PREL_LO21
+#define R_AARCH64_ADR_PREL_LO21 274
+#endif
+#ifndef R_AARCH64_ADR_PREL_PG_HI21
+#define R_AARCH64_ADR_PREL_PG_HI21 275
+#endif
+#ifndef R_AARCH64_ADR_PREL_PG_HI21_NC
+#define R_AARCH64_ADR_PREL_PG_HI21_NC 276
+#endif
+#ifndef R_AARCH64_ADD_ABS_LO12_NC
+#define R_AARCH64_ADD_ABS_LO12_NC 277
+#endif
+#ifndef R_AARCH64_LDST8_ABS_LO12_NC
+#define R_AARCH64_LDST8_ABS_LO12_NC 278
+#endif
+#ifndef R_AARCH64_TSTBR14
+#define R_AARCH64_TSTBR14 279
+#endif
+#ifndef R_AARCH64_CONDBR19
+#define R_AARCH64_CONDBR19 280
+#endif
+#ifndef R_AARCH64_JUMP26
+#define R_AARCH64_JUMP26 282
+#endif
+#ifndef R_AARCH64_CALL26
+#define R_AARCH64_CALL26 283
+#endif
+#ifndef R_AARCH64_LDST16_ABS_LO12_NC
+#define R_AARCH64_LDST16_ABS_LO12_NC 284
+#endif
+#ifndef R_AARCH64_LDST32_ABS_LO12_NC
+#define R_AARCH64_LDST32_ABS_LO12_NC 285
+#endif
+#ifndef R_AARCH64_LDST64_ABS_LO12_NC
+#define R_AARCH64_LDST64_ABS_LO12_NC 286
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G0
+#define R_AARCH64_MOVW_PREL_G0 287
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G0_NC
+#define R_AARCH64_MOVW_PREL_G0_NC 288
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G1
+#define R_AARCH64_MOVW_PREL_G1 289
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G1_NC
+#define R_AARCH64_MOVW_PREL_G1_NC 290
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G2
+#define R_AARCH64_MOVW_PREL_G2 291
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G2_NC
+#define R_AARCH64_MOVW_PREL_G2_NC 292
+#endif
+#ifndef R_AARCH64_MOVW_PREL_G3
+#define R_AARCH64_MOVW_PREL_G3 293
+#endif
+#ifndef R_AARCH64_LDST128_ABS_LO12_NC
+#define R_AARCH64_LDST128_ABS_LO12_NC 299
+#endif
+
+/* Global state of the processed ELF. */
+static struct {
+ const char *path;
+ char *begin;
+ size_t size;
+ Elf64_Ehdr *ehdr;
+ Elf64_Shdr *sh_table;
+ const char *sh_string;
+} elf;
+
+#if defined(CONFIG_CPU_LITTLE_ENDIAN)
+
+#define elf16toh(x) le16toh(x)
+#define elf32toh(x) le32toh(x)
+#define elf64toh(x) le64toh(x)
+
+#define ELFENDIAN ELFDATA2LSB
+
+#elif defined(CONFIG_CPU_BIG_ENDIAN)
+
+#define elf16toh(x) be16toh(x)
+#define elf32toh(x) be32toh(x)
+#define elf64toh(x) be64toh(x)
+
+#define ELFENDIAN ELFDATA2MSB
+
+#else
+
+#error PDP-endian sadly unsupported...
+
+#endif
+
+#define fatal_error(fmt, ...) \
+ ({ \
+ fprintf(stderr, "error: %s: " fmt "\n", \
+ elf.path, ## __VA_ARGS__); \
+ exit(EXIT_FAILURE); \
+ __builtin_unreachable(); \
+ })
+
+#define fatal_perror(msg) \
+ ({ \
+ fprintf(stderr, "error: %s: " msg ": %s\n", \
+ elf.path, strerror(errno)); \
+ exit(EXIT_FAILURE); \
+ __builtin_unreachable(); \
+ })
+
+#define assert_op(lhs, rhs, fmt, op) \
+ ({ \
+ typeof(lhs) _lhs = (lhs); \
+ typeof(rhs) _rhs = (rhs); \
+ \
+ if (!(_lhs op _rhs)) { \
+ fatal_error("assertion " #lhs " " #op " " #rhs \
+ " failed (lhs=" fmt ", rhs=" fmt \
+ ", line=%d)", _lhs, _rhs, __LINE__); \
+ } \
+ })
+
+#define assert_eq(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, ==)
+#define assert_ne(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, !=)
+#define assert_lt(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, <)
+#define assert_ge(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, >=)
+
+/*
+ * Return a pointer of a given type at a given offset from
+ * the beginning of the ELF file.
+ */
+#define elf_ptr(type, off) ((type *)(elf.begin + (off)))
+
+/* Iterate over all sections in the ELF. */
+#define for_each_section(var) \
+ for (var = elf.sh_table; var < elf.sh_table + elf16toh(elf.ehdr->e_shnum); ++var)
+
+/* Iterate over all Elf64_Rela relocations in a given section. */
+#define for_each_rela(shdr, var) \
+ for (var = elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset)); \
+ var < elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset) + elf64toh(shdr->sh_size)); var++)
+
+/* True if a string starts with a given prefix. */
+static inline bool starts_with(const char *str, const char *prefix)
+{
+ return memcmp(str, prefix, strlen(prefix)) == 0;
+}
+
+/* Returns a string containing the name of a given section. */
+static inline const char *section_name(Elf64_Shdr *shdr)
+{
+ return elf.sh_string + elf32toh(shdr->sh_name);
+}
+
+/* Returns a pointer to the first byte of section data. */
+static inline const char *section_begin(Elf64_Shdr *shdr)
+{
+ return elf_ptr(char, elf64toh(shdr->sh_offset));
+}
+
+/* Find a section by its offset from the beginning of the file. */
+static inline Elf64_Shdr *section_by_off(Elf64_Off off)
+{
+ assert_ne(off, 0UL, "%lu");
+ return elf_ptr(Elf64_Shdr, off);
+}
+
+/* Find a section by its index. */
+static inline Elf64_Shdr *section_by_idx(uint16_t idx)
+{
+ assert_ne(idx, SHN_UNDEF, "%u");
+ return &elf.sh_table[idx];
+}
+
+/*
+ * Memory-map the given ELF file, perform sanity checks, and
+ * populate global state.
+ */
+static void init_elf(const char *path)
+{
+ int fd, ret;
+ struct stat stat;
+
+ /* Store path in the global struct for error printing. */
+ elf.path = path;
+
+ /* Open the ELF file. */
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ fatal_perror("Could not open ELF file");
+
+ /* Get status of ELF file to obtain its size. */
+ ret = fstat(fd, &stat);
+ if (ret < 0) {
+ close(fd);
+ fatal_perror("Could not get status of ELF file");
+ }
+
+ /* mmap() the entire ELF file read-only at an arbitrary address. */
+ elf.begin = mmap(0, stat.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (elf.begin == MAP_FAILED) {
+ close(fd);
+ fatal_perror("Could not mmap ELF file");
+ }
+
+ /* mmap() was successful, close the FD. */
+ close(fd);
+
+ /* Get pointer to the ELF header. */
+ assert_ge(stat.st_size, sizeof(*elf.ehdr), "%lu");
+ elf.ehdr = elf_ptr(Elf64_Ehdr, 0);
+
+ /* Check the ELF magic. */
+ assert_eq(elf.ehdr->e_ident[EI_MAG0], ELFMAG0, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG1], ELFMAG1, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG2], ELFMAG2, "0x%x");
+ assert_eq(elf.ehdr->e_ident[EI_MAG3], ELFMAG3, "0x%x");
+
+ /* Sanity check that this is an ELF64 relocatable object for AArch64. */
+ assert_eq(elf.ehdr->e_ident[EI_CLASS], ELFCLASS64, "%u");
+ assert_eq(elf.ehdr->e_ident[EI_DATA], ELFENDIAN, "%u");
+ assert_eq(elf16toh(elf.ehdr->e_type), ET_REL, "%u");
+ assert_eq(elf16toh(elf.ehdr->e_machine), EM_AARCH64, "%u");
+
+ /* Populate fields of the global struct. */
+ elf.sh_table = section_by_off(elf64toh(elf.ehdr->e_shoff));
+ elf.sh_string = section_begin(section_by_idx(elf16toh(elf.ehdr->e_shstrndx)));
+}
+
+/* Print the prologue of the output ASM file. */
+static void emit_prologue(void)
+{
+ printf(".data\n"
+ ".pushsection " HYP_RELOC_SECTION ", \"a\"\n");
+}
+
+/* Print ASM statements needed as a prologue to a processed hyp section. */
+static void emit_section_prologue(const char *sh_orig_name)
+{
+ /* Declare the hyp section symbol. */
+ printf(".global %s%s\n", HYP_SECTION_SYMBOL_PREFIX, sh_orig_name);
+}
+
+/*
+ * Print ASM statements to create a hyp relocation entry for a given
+ * R_AARCH64_ABS64 relocation.
+ *
+ * The linker of vmlinux will populate the position given by `rela` with
+ * an absolute 64-bit kernel VA. If the kernel is relocatable, it will
+ * also generate a dynamic relocation entry so that the kernel can shift
+ * the address at runtime for KASLR.
+ *
+ * Emit a 32-bit offset from the current address to the position given
+ * by `rela`. This way the kernel can iterate over all kernel VAs used
+ * by hyp at runtime and convert them to hyp VAs. However, that offset
+ * will not be known until linking of `vmlinux`, so emit a PREL32
+ * relocation referencing a symbol that the hyp linker script put at
+ * the beginning of the relocated section + the offset from `rela`.
+ */
+static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name)
+{
+ /* Offset of this reloc from the beginning of HYP_RELOC_SECTION. */
+ static size_t reloc_offset;
+
+ /* Create storage for the 32-bit offset. */
+ printf(".word 0\n");
+
+ /*
+ * Create a PREL32 relocation which instructs the linker of `vmlinux`
+ * to insert offset to position <base> + <offset>, where <base> is
+ * a symbol at the beginning of the relocated section, and <offset>
+ * is `rela->r_offset`.
+ */
+ printf(".reloc %lu, R_AARCH64_PREL32, %s%s + 0x%lx\n",
+ reloc_offset, HYP_SECTION_SYMBOL_PREFIX, sh_orig_name,
+ elf64toh(rela->r_offset));
+
+ reloc_offset += 4;
+}
+
+/* Print the epilogue of the output ASM file. */
+static void emit_epilogue(void)
+{
+ printf(".popsection\n");
+}
+
+/*
+ * Iterate over all RELA relocations in a given section and emit
+ * hyp relocation data for all absolute addresses in hyp code/data.
+ *
+ * Static relocations that generate PC-relative-addressing are ignored.
+ * Failure is reported for unexpected relocation types.
+ */
+static void emit_rela_section(Elf64_Shdr *sh_rela)
+{
+ Elf64_Shdr *sh_orig = &elf.sh_table[elf32toh(sh_rela->sh_info)];
+ const char *sh_orig_name = section_name(sh_orig);
+ Elf64_Rela *rela;
+
+ /* Skip all non-hyp sections. */
+ if (!starts_with(sh_orig_name, HYP_SECTION_PREFIX))
+ return;
+
+ emit_section_prologue(sh_orig_name);
+
+ for_each_rela(sh_rela, rela) {
+ uint32_t type = (uint32_t)elf64toh(rela->r_info);
+
+ /* Check that rela points inside the relocated section. */
+ assert_lt(elf64toh(rela->r_offset), elf64toh(sh_orig->sh_size), "0x%lx");
+
+ switch (type) {
+ /*
+ * Data relocations to generate absolute addressing.
+ * Emit a hyp relocation.
+ */
+ case R_AARCH64_ABS64:
+ emit_rela_abs64(rela, sh_orig_name);
+ break;
+ /* Allow relocations to generate PC-relative addressing. */
+ case R_AARCH64_LD_PREL_LO19:
+ case R_AARCH64_ADR_PREL_LO21:
+ case R_AARCH64_ADR_PREL_PG_HI21:
+ case R_AARCH64_ADR_PREL_PG_HI21_NC:
+ case R_AARCH64_ADD_ABS_LO12_NC:
+ case R_AARCH64_LDST8_ABS_LO12_NC:
+ case R_AARCH64_LDST16_ABS_LO12_NC:
+ case R_AARCH64_LDST32_ABS_LO12_NC:
+ case R_AARCH64_LDST64_ABS_LO12_NC:
+ case R_AARCH64_LDST128_ABS_LO12_NC:
+ break;
+ /* Allow relative relocations for control-flow instructions. */
+ case R_AARCH64_TSTBR14:
+ case R_AARCH64_CONDBR19:
+ case R_AARCH64_JUMP26:
+ case R_AARCH64_CALL26:
+ break;
+ /* Allow group relocations to create PC-relative offset inline. */
+ case R_AARCH64_MOVW_PREL_G0:
+ case R_AARCH64_MOVW_PREL_G0_NC:
+ case R_AARCH64_MOVW_PREL_G1:
+ case R_AARCH64_MOVW_PREL_G1_NC:
+ case R_AARCH64_MOVW_PREL_G2:
+ case R_AARCH64_MOVW_PREL_G2_NC:
+ case R_AARCH64_MOVW_PREL_G3:
+ break;
+ default:
+ fatal_error("Unexpected RELA type %u", type);
+ }
+ }
+}
+
+/* Iterate over all sections and emit hyp relocation data for RELA sections. */
+static void emit_all_relocs(void)
+{
+ Elf64_Shdr *shdr;
+
+ for_each_section(shdr) {
+ switch (elf32toh(shdr->sh_type)) {
+ case SHT_REL:
+ fatal_error("Unexpected SHT_REL section \"%s\"",
+ section_name(shdr));
+ case SHT_RELA:
+ emit_rela_section(shdr);
+ break;
+ }
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <elf_input>\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ init_elf(argv[1]);
+
+ emit_prologue();
+ emit_all_relocs();
+ emit_epilogue();
+
+ return EXIT_SUCCESS;
+}
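The entries emitted above are consumed at boot by kvm_apply_hyp_relocations()
(va_layout.c, listed in the diffstat but not shown here). A simplified, hedged
sketch of that walk, with the kernel-VA-to-hyp-VA conversion and details such
as writing through the linear alias abstracted away::

  #include <linux/types.h>

  extern char __hyp_reloc_begin[], __hyp_reloc_end[];

  /* Simplified sketch, not the in-tree code: each .hyp.reloc entry is a
   * 32-bit offset from its own address to a location that currently holds
   * an absolute kernel VA; rewrite that location with the hyp VA. */
  static void apply_hyp_relocations(unsigned long (*kern_to_hyp_va)(unsigned long))
  {
          s32 *rel;

          for (rel = (s32 *)__hyp_reloc_begin; rel < (s32 *)__hyp_reloc_end; rel++) {
                  unsigned long *ptr = (unsigned long *)((char *)rel + *rel);

                  *ptr = kern_to_hyp_va(*ptr);
          }
  }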
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index a820dfdc9c25..6585a7cbbc56 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -74,27 +74,28 @@ SYM_FUNC_END(__host_enter)
* void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
- /* Load the format arguments into x1-7 */
- mov x6, x3
- get_vcpu_ptr x7, x3
-
- mrs x3, esr_el2
- mrs x4, far_el2
- mrs x5, hpfar_el2
-
 /* Prepare and exit to the host's panic function. */
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
msr spsr_el2, lr
ldr lr, =panic
+ hyp_kimg_va lr, x6
msr elr_el2, lr
- /*
- * Set the panic format string and enter the host, conditionally
- * restoring the host context.
- */
+ /* Set the panic format string. Use the, now free, LR as scratch. */
+ ldr lr, =__hyp_panic_string
+ hyp_kimg_va lr, x6
+
+ /* Load the format arguments into x1-7. */
+ mov x6, x3
+ get_vcpu_ptr x7, x3
+ mrs x3, esr_el2
+ mrs x4, far_el2
+ mrs x5, hpfar_el2
+
+ /* Enter the host, conditionally restoring the host context. */
cmp x0, xzr
- ldr x0, =__hyp_panic_string
+ mov x0, lr
b.eq __host_enter_without_restoring
b __host_enter_for_panic
SYM_FUNC_END(__hyp_do_panic)
@@ -124,7 +125,7 @@ SYM_FUNC_END(__hyp_do_panic)
* Preserve x0-x4, which may contain stub parameters.
*/
ldr x5, =__kvm_handle_stub_hvc
- kimg_pa x5, x6
+ hyp_pa x5, x6
br x5
.L__vect_end\@:
.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 882fd40d068e..c631e29fb001 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -18,7 +18,7 @@
#include <asm/virt.h>
.text
- .pushsection .hyp.idmap.text, "ax"
+ .pushsection .idmap.text, "ax"
.align 11
@@ -57,17 +57,10 @@ __do_hyp_init:
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
- // We only actively check bits [24:31], and everything
- // else has to be zero, which we check at build time.
-#if (KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) & 0xFFFFFFFF00FFFFFF)
-#error Unexpected __KVM_HOST_SMCCC_FUNC___kvm_hyp_init value
-#endif
-
- ror x0, x0, #24
- eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 24) & 0xF)
- ror x0, x0, #4
- eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 28) & 0xF)
- cbz x0, 1f
+ mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x3
+ b.eq 1f
+
mov x0, #SMCCC_RET_NOT_SUPPORTED
eret
@@ -141,7 +134,6 @@ alternative_else_nop_endif
/* Set the host vector */
ldr x0, =__kvm_hyp_host_vector
- kimg_hyp_va x0, x1
msr vbar_el2, x0
ret
@@ -200,7 +192,6 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
/* Leave idmap. */
mov x0, x29
ldr x1, =kvm_host_psci_cpu_entry
- kimg_hyp_va x1, x2
br x1
SYM_CODE_END(__kvm_hyp_init_cpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index a906f9e2ff34..f012f8665ecc 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -108,9 +108,9 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
typedef void (*hcall_t)(struct kvm_cpu_context *);
-#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x)
+#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
-static const hcall_t *host_hcall[] = {
+static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
@@ -130,7 +130,6 @@ static const hcall_t *host_hcall[] = {
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, id, host_ctxt, 0);
- const hcall_t *kfn;
hcall_t hfn;
id -= KVM_HOST_SMCCC_ID(0);
@@ -138,13 +137,11 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
if (unlikely(id >= ARRAY_SIZE(host_hcall)))
goto inval;
- kfn = host_hcall[id];
- if (unlikely(!kfn))
+ hfn = host_hcall[id];
+ if (unlikely(!hfn))
goto inval;
cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
-
- hfn = kimg_fn_hyp_va(kfn);
hfn(host_ctxt);
return;
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
index 2997aa156d8e..879559057dee 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -33,8 +33,8 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
hyp_panic();
- cpu_base_array = (unsigned long *)hyp_symbol_addr(kvm_arm_hyp_percpu_base);
+ cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
- elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start);
+ elf_base = (unsigned long)&__per_cpu_start;
return this_cpu_base - elf_base;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 1206d0d754d5..cd119d82d8e3 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -12,14 +12,17 @@
#include <asm/memory.h>
SECTIONS {
+ HYP_SECTION(.idmap.text)
HYP_SECTION(.text)
+ HYP_SECTION(.data..ro_after_init)
+ HYP_SECTION(.rodata)
+
/*
* .hyp..data..percpu needs to be page aligned to maintain the same
* alignment for when linking into vmlinux.
*/
. = ALIGN(PAGE_SIZE);
- HYP_SECTION_NAME(.data..percpu) : {
+ BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
- }
- HYP_SECTION(.data..ro_after_init)
+ END_HYP_SECTION
}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 8e7128cb7667..63de71c0481e 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -128,8 +128,8 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt)
if (cpu_id == INVALID_CPU_ID)
return PSCI_RET_INVALID_PARAMS;
- boot_args = per_cpu_ptr(hyp_symbol_addr(cpu_on_args), cpu_id);
- init_params = per_cpu_ptr(hyp_symbol_addr(kvm_init_params), cpu_id);
+ boot_args = per_cpu_ptr(&cpu_on_args, cpu_id);
+ init_params = per_cpu_ptr(&kvm_init_params, cpu_id);
/* Check if the target CPU is already being booted. */
if (!try_acquire_boot_args(boot_args))
@@ -140,7 +140,7 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt)
wmb();
ret = psci_call(func_id, mpidr,
- __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_entry)),
+ __hyp_pa(&kvm_hyp_cpu_entry),
__hyp_pa(init_params));
/* If successful, the lock will be released by the target CPU. */
@@ -159,8 +159,8 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
struct psci_boot_args *boot_args;
struct kvm_nvhe_init_params *init_params;
- boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
- init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params));
+ boot_args = this_cpu_ptr(&suspend_args);
+ init_params = this_cpu_ptr(&kvm_init_params);
/*
* No need to acquire a lock before writing to boot_args because a core
@@ -174,7 +174,7 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
* point if it is a deep sleep state.
*/
return psci_call(func_id, power_state,
- __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)),
+ __hyp_pa(&kvm_hyp_cpu_resume),
__hyp_pa(init_params));
}
@@ -186,8 +186,8 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
struct psci_boot_args *boot_args;
struct kvm_nvhe_init_params *init_params;
- boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
- init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params));
+ boot_args = this_cpu_ptr(&suspend_args);
+ init_params = this_cpu_ptr(&kvm_init_params);
/*
* No need to acquire a lock before writing to boot_args because a core
@@ -198,7 +198,7 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
/* Will only return on error. */
return psci_call(func_id,
- __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)),
+ __hyp_pa(&kvm_hyp_cpu_resume),
__hyp_pa(init_params), 0);
}
@@ -207,12 +207,12 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on)
struct psci_boot_args *boot_args;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &this_cpu_ptr(hyp_symbol_addr(kvm_host_data))->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
if (is_cpu_on)
- boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args));
+ boot_args = this_cpu_ptr(&cpu_on_args);
else
- boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
+ boot_args = this_cpu_ptr(&suspend_args);
cpu_reg(host_ctxt, 0) = boot_args->r0;
write_sysreg_el2(boot_args->pc, SYS_ELR);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdf8e55ed308..4d177ce1d536 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -45,6 +45,10 @@
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+ KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
struct kvm_pgtable_walk_data {
struct kvm_pgtable *pgt;
struct kvm_pgtable_walker *walker;
@@ -170,10 +174,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
smp_store_release(ptep, pte);
}
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
- u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
{
- kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ kvm_pte_t pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
KVM_PTE_TYPE_BLOCK;
@@ -181,12 +184,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
- /* Tolerate KVM recreating the exact same mapping. */
- if (kvm_pte_valid(old))
- return old == pte;
-
- smp_store_release(ptep, pte);
- return true;
+ return pte;
}
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
@@ -341,12 +339,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
{
+ kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
- WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+ /* Tolerate KVM recreating the exact same mapping */
+ new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+ if (old != new && !WARN_ON(kvm_pte_valid(old)))
+ smp_store_release(ptep, new);
+
data->phys += granule;
return true;
}
@@ -461,34 +464,41 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
return 0;
}
-static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
- kvm_pte_t *ptep,
- struct stage2_map_data *data)
+static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
{
+ kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
+ struct page *page = virt_to_page(ptep);
if (!kvm_block_mapping_supported(addr, end, phys, level))
- return false;
-
- /*
- * If the PTE was already valid, drop the refcount on the table
- * early, as it will be bumped-up again in stage2_map_walk_leaf().
- * This ensures that the refcount stays constant across a valid to
- * valid PTE update.
- */
- if (kvm_pte_valid(*ptep))
- put_page(virt_to_page(ptep));
-
- if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
- goto out;
+ return -E2BIG;
+
+ new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+ if (kvm_pte_valid(old)) {
+ /*
+ * Skip updating the PTE if we are trying to recreate the exact
+ * same mapping or only change the access permissions. Instead,
+ * the vCPU will exit one more time from guest if still needed
+ * and then go through the path of relaxing permissions.
+ */
+ if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+ return -EAGAIN;
+
+ /*
+ * There's an existing different valid leaf entry, so perform
+ * break-before-make.
+ */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ put_page(page);
+ }
- /* There's an existing valid leaf entry, so perform break-before-make */
- kvm_set_invalid_pte(ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
- kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
-out:
+ smp_store_release(ptep, new);
+ get_page(page);
data->phys += granule;
- return true;
+ return 0;
}
static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
@@ -516,6 +526,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct stage2_map_data *data)
{
+ int ret;
kvm_pte_t *childp, pte = *ptep;
struct page *page = virt_to_page(ptep);
@@ -526,8 +537,9 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
return 0;
}
- if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
- goto out_get_page;
+ ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data);
+ if (ret != -E2BIG)
+ return ret;
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
@@ -551,9 +563,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
}
kvm_set_table_pte(ptep, childp);
-
-out_get_page:
get_page(page);
+
return 0;
}
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 8f0585640241..87a54375bd6e 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -64,7 +64,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
}
rd = kvm_vcpu_dabt_get_rd(vcpu);
- addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va;
+ addr = kvm_vgic_global_state.vcpu_hyp_va;
addr += fault_ipa - vgic->vgic_cpu_base;
if (kvm_vcpu_dabt_iswrite(vcpu)) {
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 25ea4ecb6449..ead21b98b620 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -71,6 +71,12 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
if (gpa != GPA_INVALID)
val = gpa;
break;
+ case ARM_SMCCC_TRNG_VERSION:
+ case ARM_SMCCC_TRNG_FEATURES:
+ case ARM_SMCCC_TRNG_GET_UUID:
+ case ARM_SMCCC_TRNG_RND32:
+ case ARM_SMCCC_TRNG_RND64:
+ return kvm_trng_call(vcpu);
default:
return kvm_psci_call(vcpu);
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7d2257cc5438..77cb2d28f2a4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -879,11 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
- if (writable) {
+ if (writable)
prot |= KVM_PGTABLE_PROT_W;
- kvm_set_pfn_dirty(pfn);
- mark_page_dirty(kvm, gfn);
- }
if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
@@ -911,11 +908,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
memcache);
}
+ /* Mark the page dirty only if the fault is handled successfully */
+ if (writable && !ret) {
+ kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
+ }
+
out_unlock:
spin_unlock(&kvm->mmu_lock);
kvm_set_pfn_accessed(pfn);
kvm_release_pfn_clean(pfn);
- return ret;
+ return ret != -EAGAIN ? ret : 0;
}
/* Resolve the access fault by making the page young again. */
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 247422ac78a9..e9ec08b0b070 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -23,11 +23,11 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
switch (kvm->arch.pmuver) {
- case 1: /* ARMv8.0 */
+ case ID_AA64DFR0_PMUVER_8_0:
return GENMASK(9, 0);
- case 4: /* ARMv8.1 */
- case 5: /* ARMv8.4 */
- case 6: /* ARMv8.5 */
+ case ID_AA64DFR0_PMUVER_8_1:
+ case ID_AA64DFR0_PMUVER_8_4:
+ case ID_AA64DFR0_PMUVER_8_5:
return GENMASK(15, 0);
default: /* Shouldn't be here, just for sanity */
WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
@@ -795,6 +795,12 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
base = 0;
} else {
val = read_sysreg(pmceid1_el0);
+ /*
+ * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled
+ * as RAZ
+ */
+ if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4)
+ val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32);
base = 32;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 7c4f79532406..4f2f1e3145de 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -9,6 +9,7 @@
* Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/bitfield.h>
#include <linux/bsearch.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
@@ -700,14 +701,18 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- u64 pmceid;
+ u64 pmceid, mask, shift;
BUG_ON(p->is_write);
if (pmu_access_el0_disabled(vcpu))
return false;
+ get_access_mask(r, &mask, &shift);
+
pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
+ pmceid &= mask;
+ pmceid >>= shift;
p->regval = pmceid;
@@ -1021,6 +1026,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
+#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT))
+
/* Read a sanitised cpufeature ID register by sys_reg_desc */
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r, bool raz)
@@ -1028,36 +1035,41 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
u32 id = reg_to_encoding(r);
u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
- if (id == SYS_ID_AA64PFR0_EL1) {
+ switch (id) {
+ case SYS_ID_AA64PFR0_EL1:
if (!vcpu_has_sve(vcpu))
- val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
- val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
- val &= ~(0xfUL << ID_AA64PFR0_CSV2_SHIFT);
- val |= ((u64)vcpu->kvm->arch.pfr0_csv2 << ID_AA64PFR0_CSV2_SHIFT);
- val &= ~(0xfUL << ID_AA64PFR0_CSV3_SHIFT);
- val |= ((u64)vcpu->kvm->arch.pfr0_csv3 << ID_AA64PFR0_CSV3_SHIFT);
- } else if (id == SYS_ID_AA64PFR1_EL1) {
- val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT);
- } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
- val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
- (0xfUL << ID_AA64ISAR1_API_SHIFT) |
- (0xfUL << ID_AA64ISAR1_GPA_SHIFT) |
- (0xfUL << ID_AA64ISAR1_GPI_SHIFT));
- } else if (id == SYS_ID_AA64DFR0_EL1) {
- u64 cap = 0;
-
- /* Limit guests to PMUv3 for ARMv8.1 */
- if (kvm_vcpu_has_pmu(vcpu))
- cap = ID_AA64DFR0_PMUVER_8_1;
-
+ val &= ~FEATURE(ID_AA64PFR0_SVE);
+ val &= ~FEATURE(ID_AA64PFR0_AMU);
+ val &= ~FEATURE(ID_AA64PFR0_CSV2);
+ val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
+ val &= ~FEATURE(ID_AA64PFR0_CSV3);
+ val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
+ break;
+ case SYS_ID_AA64PFR1_EL1:
+ val &= ~FEATURE(ID_AA64PFR1_MTE);
+ break;
+ case SYS_ID_AA64ISAR1_EL1:
+ if (!vcpu_has_ptrauth(vcpu))
+ val &= ~(FEATURE(ID_AA64ISAR1_APA) |
+ FEATURE(ID_AA64ISAR1_API) |
+ FEATURE(ID_AA64ISAR1_GPA) |
+ FEATURE(ID_AA64ISAR1_GPI));
+ break;
+ case SYS_ID_AA64DFR0_EL1:
+ /* Limit debug to ARMv8.0 */
+ val &= ~FEATURE(ID_AA64DFR0_DEBUGVER);
+ val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6);
+ /* Limit guests to PMUv3 for ARMv8.4 */
val = cpuid_feature_cap_perfmon_field(val,
- ID_AA64DFR0_PMUVER_SHIFT,
- cap);
- } else if (id == SYS_ID_DFR0_EL1) {
- /* Limit guests to PMUv3 for ARMv8.1 */
+ ID_AA64DFR0_PMUVER_SHIFT,
+ kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
+ break;
+ case SYS_ID_DFR0_EL1:
+ /* Limit guests to PMUv3 for ARMv8.4 */
val = cpuid_feature_cap_perfmon_field(val,
- ID_DFR0_PERFMON_SHIFT,
- ID_DFR0_PERFMON_8_1);
+ ID_DFR0_PERFMON_SHIFT,
+ kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_PERFMON_8_4 : 0);
+ break;
}
return val;
@@ -1493,6 +1505,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
.access = access_pminten, .reg = PMINTENSET_EL1 },
{ PMU_SYS_REG(SYS_PMINTENCLR_EL1),
.access = access_pminten, .reg = PMINTENSET_EL1 },
+ { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi },
{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
@@ -1720,7 +1733,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
};
-static bool trap_dbgidr(struct kvm_vcpu *vcpu,
+static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
@@ -1734,7 +1747,7 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu,
p->regval = ((((dfr >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) << 28) |
(((dfr >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) << 24) |
(((dfr >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) << 20)
- | (6 << 16) | (el3 << 14) | (el3 << 12));
+ | (6 << 16) | (1 << 15) | (el3 << 14) | (el3 << 12));
return true;
}
}
@@ -1767,8 +1780,8 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu,
* guest. Revisit this one day, would this principle change.
*/
static const struct sys_reg_desc cp14_regs[] = {
- /* DBGIDR */
- { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgidr },
+ /* DBGDIDR */
+ { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr },
/* DBGDTRRXext */
{ Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi },
@@ -1918,8 +1931,8 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs },
{ Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc },
{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr },
- { Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
- { Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
+ { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
+ { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
{ Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr },
{ Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper },
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr },
@@ -1927,6 +1940,10 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten },
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten },
{ Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs },
+ { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 4), access_pmceid },
+ { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 5), access_pmceid },
+ /* PMMIR */
+ { Op1( 0), CRn( 9), CRm(14), Op2( 6), trap_raz_wi },
/* PRRR/MAIR0 */
{ AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 },
diff --git a/arch/arm64/kvm/trng.c b/arch/arm64/kvm/trng.c
new file mode 100644
index 000000000000..99bdd7103c9c
--- /dev/null
+++ b/arch/arm64/kvm/trng.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+
+#define ARM_SMCCC_TRNG_VERSION_1_0 0x10000UL
+
+/* Those values are deliberately separate from the generic SMCCC definitions. */
+#define TRNG_SUCCESS 0UL
+#define TRNG_NOT_SUPPORTED ((unsigned long)-1)
+#define TRNG_INVALID_PARAMETER ((unsigned long)-2)
+#define TRNG_NO_ENTROPY ((unsigned long)-3)
+
+#define TRNG_MAX_BITS64 192
+
+static const uuid_t arm_smc_trng_uuid __aligned(4) = UUID_INIT(
+ 0x0d21e000, 0x4384, 0x11eb, 0x80, 0x70, 0x52, 0x44, 0x55, 0x4e, 0x5a, 0x4c);
+
+static int kvm_trng_do_rnd(struct kvm_vcpu *vcpu, int size)
+{
+ DECLARE_BITMAP(bits, TRNG_MAX_BITS64);
+ u32 num_bits = smccc_get_arg1(vcpu);
+ int i;
+
+ if (num_bits > 3 * size) {
+ smccc_set_retval(vcpu, TRNG_INVALID_PARAMETER, 0, 0, 0);
+ return 1;
+ }
+
+ /* get as many bits as we need to fulfil the request */
+ for (i = 0; i < DIV_ROUND_UP(num_bits, BITS_PER_LONG); i++)
+ bits[i] = get_random_long();
+
+ bitmap_clear(bits, num_bits, TRNG_MAX_BITS64 - num_bits);
+
+ if (size == 32)
+ smccc_set_retval(vcpu, TRNG_SUCCESS, lower_32_bits(bits[1]),
+ upper_32_bits(bits[0]), lower_32_bits(bits[0]));
+ else
+ smccc_set_retval(vcpu, TRNG_SUCCESS, bits[2], bits[1], bits[0]);
+
+ memzero_explicit(bits, sizeof(bits));
+ return 1;
+}
+
+int kvm_trng_call(struct kvm_vcpu *vcpu)
+{
+ const __le32 *u = (__le32 *)arm_smc_trng_uuid.b;
+ u32 func_id = smccc_get_function(vcpu);
+ unsigned long val = TRNG_NOT_SUPPORTED;
+ int size = 64;
+
+ switch (func_id) {
+ case ARM_SMCCC_TRNG_VERSION:
+ val = ARM_SMCCC_TRNG_VERSION_1_0;
+ break;
+ case ARM_SMCCC_TRNG_FEATURES:
+ switch (smccc_get_arg1(vcpu)) {
+ case ARM_SMCCC_TRNG_VERSION:
+ case ARM_SMCCC_TRNG_FEATURES:
+ case ARM_SMCCC_TRNG_GET_UUID:
+ case ARM_SMCCC_TRNG_RND32:
+ case ARM_SMCCC_TRNG_RND64:
+ val = TRNG_SUCCESS;
+ }
+ break;
+ case ARM_SMCCC_TRNG_GET_UUID:
+ smccc_set_retval(vcpu, le32_to_cpu(u[0]), le32_to_cpu(u[1]),
+ le32_to_cpu(u[2]), le32_to_cpu(u[3]));
+ return 1;
+ case ARM_SMCCC_TRNG_RND32:
+ size = 32;
+ fallthrough;
+ case ARM_SMCCC_TRNG_RND64:
+ return kvm_trng_do_rnd(vcpu, size);
+ }
+
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return 1;
+}
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index 70fcd6a12fe1..978301392d67 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -81,6 +81,34 @@ __init void kvm_compute_layout(void)
init_hyp_physvirt_offset();
}
+/*
+ * The .hyp.reloc ELF section contains a list of kimg positions that
+ * contains kimg VAs but will be accessed only in hyp execution context.
+ * Convert them to hyp VAs. See gen-hyprel.c for more details.
+ */
+__init void kvm_apply_hyp_relocations(void)
+{
+ int32_t *rel;
+ int32_t *begin = (int32_t *)__hyp_reloc_begin;
+ int32_t *end = (int32_t *)__hyp_reloc_end;
+
+ for (rel = begin; rel < end; ++rel) {
+ uintptr_t *ptr, kimg_va;
+
+ /*
+ * Each entry contains a 32-bit relative offset from itself
+ * to a kimg VA position.
+ */
+ ptr = (uintptr_t *)lm_alias((char *)rel + *rel);
+
+ /* Read the kimg VA value at the relocation address. */
+ kimg_va = *ptr;
+
+ /* Convert to hyp VA and store back to the relocation address. */
+ *ptr = __early_kern_hyp_va((uintptr_t)lm_alias(kimg_va));
+ }
+}
+
static u32 compute_instruction(int n, u32 rd, u32 rn)
{
u32 insn = AARCH64_BREAK_FAULT;
@@ -255,12 +283,6 @@ static void generate_mov_q(u64 val, __le32 *origptr, __le32 *updptr, int nr_inst
*updptr++ = cpu_to_le32(insn);
}
-void kvm_update_kimg_phys_offset(struct alt_instr *alt,
- __le32 *origptr, __le32 *updptr, int nr_inst)
-{
- generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst);
-}
-
void kvm_get_kimage_voffset(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 24f3d0f9996b..3a5612e7304c 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -83,7 +83,6 @@
#define KVM_MAX_VCPUS 16
-#define KVM_USER_MEM_SLOTS 16
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 0
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 8a88eb265516..6ce2117e49f6 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -10,7 +10,6 @@
#define _ASM_SPINLOCK_H
#include <asm/processor.h>
-#include <asm/qrwlock.h>
#include <asm-generic/qspinlock_types.h>
@@ -27,5 +26,6 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
}
#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
#endif /* _ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index c98f5141e3fc..ed6086d57b22 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -535,9 +535,12 @@ struct h_cpu_char_result {
u64 behaviour;
};
-/* Register state for entering a nested guest with H_ENTER_NESTED */
+/*
+ * Register state for entering a nested guest with H_ENTER_NESTED.
+ * New member must be added at the end.
+ */
struct hv_guest_state {
- u64 version; /* version of this structure layout */
+ u64 version; /* version of this structure layout, must be first */
u32 lpid;
u32 vcpu_token;
/* These registers are hypervisor privileged (at least for writing) */
@@ -566,10 +569,26 @@ struct hv_guest_state {
u64 pidr;
u64 cfar;
u64 ppr;
+ /* Version 1 ends here */
+ u64 dawr1;
+ u64 dawrx1;
+ /* Version 2 ends here */
};
/* Latest version of hv_guest_state structure */
-#define HV_GUEST_STATE_VERSION 1
+#define HV_GUEST_STATE_VERSION 2
+
+static inline int hv_guest_state_size(unsigned int version)
+{
+ switch (version) {
+ case 1:
+ return offsetofend(struct hv_guest_state, ppr);
+ case 2:
+ return offsetofend(struct hv_guest_state, dawrx1);
+ default:
+ return -1;
+ }
+}
/*
* From the document "H_GetPerformanceCounterInfo Interface" v1.07
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..b6d31bff5209 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -74,16 +74,6 @@ struct kvm_split_mode {
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
struct kvmppc_vcore *vc[MAX_SUBCORES];
- /* Bits for changing lpcr on P9 */
- unsigned long lpcr_req;
- unsigned long lpidr_req;
- unsigned long host_lpcr;
- u32 do_set;
- u32 do_restore;
- union {
- u32 allphases;
- u8 phase[4];
- } lpcr_sync;
};
/*
@@ -110,7 +100,6 @@ struct kvmppc_host_state {
u8 hwthread_state;
u8 host_ipi;
u8 ptid; /* thread number within subcore when split */
- u8 tid; /* thread number within whole core */
u8 fake_suspend;
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d67a470e95a3..05fb00d37609 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -28,7 +28,6 @@
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
-#define KVM_USER_MEM_SLOTS 512
#include <asm/cputhreads.h>
@@ -307,6 +306,7 @@ struct kvm_arch {
u8 svm_enabled;
bool threads_indep;
bool nested_enable;
+ bool dawr1_enabled;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
@@ -584,8 +584,10 @@ struct kvm_vcpu_arch {
u32 ctrl;
u32 dabrx;
ulong dabr;
- ulong dawr;
- ulong dawrx;
+ ulong dawr0;
+ ulong dawrx0;
+ ulong dawr1;
+ ulong dawrx1;
ulong ciabr;
ulong cfar;
ulong ppr;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0a056c64c317..df4bda867bab 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -314,6 +314,8 @@ struct kvmppc_ops {
int size);
int (*enable_svm)(struct kvm *kvm);
int (*svm_off)(struct kvm *kvm);
+ int (*enable_dawr1)(struct kvm *kvm);
+ bool (*hash_v3_possible)(void);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index c3af3f324c5a..9f18fa090f1f 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1)
#define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2)
#define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
+#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
+#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..b690c70f061c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -526,8 +526,10 @@ int main(void)
OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl);
OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr);
OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
- OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr);
- OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx);
+ OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0);
+ OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0);
+ OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1);
+ OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1);
OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
@@ -668,7 +670,6 @@ int main(void)
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
- HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
@@ -698,8 +699,6 @@ int main(void)
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
- OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
- OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6f612d240392..f09708da216e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -134,7 +134,7 @@ static inline bool nesting_enabled(struct kvm *kvm)
}
/* If set, the threads on each CPU core have to be in the same MMU mode */
-static bool no_mixing_hpt_and_radix;
+static bool no_mixing_hpt_and_radix __read_mostly;
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -782,8 +782,24 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
return H_UNSUPPORTED_FLAG_START;
if (value2 & DABRX_HYP)
return H_P4;
- vcpu->arch.dawr = value1;
- vcpu->arch.dawrx = value2;
+ vcpu->arch.dawr0 = value1;
+ vcpu->arch.dawrx0 = value2;
+ return H_SUCCESS;
+ case H_SET_MODE_RESOURCE_SET_DAWR1:
+ if (!kvmppc_power8_compatible(vcpu))
+ return H_P2;
+ if (!ppc_breakpoint_available())
+ return H_P2;
+ if (!cpu_has_feature(CPU_FTR_DAWR1))
+ return H_P2;
+ if (!vcpu->kvm->arch.dawr1_enabled)
+ return H_FUNCTION;
+ if (mflags)
+ return H_UNSUPPORTED_FLAG_START;
+ if (value2 & DABRX_HYP)
+ return H_P4;
+ vcpu->arch.dawr1 = value1;
+ vcpu->arch.dawrx1 = value2;
return H_SUCCESS;
case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
/* KVM does not support mflags=2 (AIL=2) */
@@ -1759,10 +1775,16 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
break;
case KVM_REG_PPC_DAWR:
- *val = get_reg_val(id, vcpu->arch.dawr);
+ *val = get_reg_val(id, vcpu->arch.dawr0);
break;
case KVM_REG_PPC_DAWRX:
- *val = get_reg_val(id, vcpu->arch.dawrx);
+ *val = get_reg_val(id, vcpu->arch.dawrx0);
+ break;
+ case KVM_REG_PPC_DAWR1:
+ *val = get_reg_val(id, vcpu->arch.dawr1);
+ break;
+ case KVM_REG_PPC_DAWRX1:
+ *val = get_reg_val(id, vcpu->arch.dawrx1);
break;
case KVM_REG_PPC_CIABR:
*val = get_reg_val(id, vcpu->arch.ciabr);
@@ -1991,10 +2013,16 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
vcpu->arch.vcore->vtb = set_reg_val(id, *val);
break;
case KVM_REG_PPC_DAWR:
- vcpu->arch.dawr = set_reg_val(id, *val);
+ vcpu->arch.dawr0 = set_reg_val(id, *val);
break;
case KVM_REG_PPC_DAWRX:
- vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+ vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
+ break;
+ case KVM_REG_PPC_DAWR1:
+ vcpu->arch.dawr1 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DAWRX1:
+ vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP;
break;
case KVM_REG_PPC_CIABR:
vcpu->arch.ciabr = set_reg_val(id, *val);
@@ -2862,11 +2890,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
return false;
- /* Some POWER9 chips require all threads to be in the same MMU mode */
- if (no_mixing_hpt_and_radix &&
- kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
- return false;
-
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2905,6 +2928,9 @@ static void prepare_threads(struct kvmppc_vcore *vc)
for_each_runnable_thread(i, vcpu, vc) {
if (signal_pending(vcpu->arch.run_task))
vcpu->arch.ret = -EINTR;
+ else if (no_mixing_hpt_and_radix &&
+ kvm_is_radix(vc->kvm) != radix_enabled())
+ vcpu->arch.ret = -EINVAL;
else if (vcpu->arch.vpa.update_pending ||
vcpu->arch.slb_shadow.update_pending ||
vcpu->arch.dtl.update_pending)
@@ -3110,7 +3136,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int controlled_threads;
int trap;
bool is_power8;
- bool hpt_on_radix;
/*
* Remove from the list any threads that have a signal pending
@@ -3143,11 +3168,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* this is a HPT guest on a radix host machine where the
* CPU threads may not be in different MMU modes.
*/
- hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() &&
- !kvm_is_radix(vc->kvm);
- if (((controlled_threads > 1) &&
- ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
- (hpt_on_radix && vc->kvm->arch.threads_indep)) {
+ if ((controlled_threads > 1) &&
+ ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
@@ -3215,7 +3237,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
&& !cpu_has_feature(CPU_FTR_ARCH_300);
- if (split > 1 || hpt_on_radix) {
+ if (split > 1) {
sip = &split_info;
memset(&split_info, 0, sizeof(split_info));
for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -3237,13 +3259,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.subcore_size = subcore_size;
} else {
split_info.subcore_size = 1;
- if (hpt_on_radix) {
- /* Use the split_info for LPCR/LPIDR changes */
- split_info.lpcr_req = vc->lpcr;
- split_info.lpidr_req = vc->kvm->arch.lpid;
- split_info.host_lpcr = vc->kvm->arch.host_lpcr;
- split_info.do_set = 1;
- }
}
/* order writes to split_info before kvm_split_mode pointer */
@@ -3253,7 +3268,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
for (thr = 0; thr < controlled_threads; ++thr) {
struct paca_struct *paca = paca_ptrs[pcpu + thr];
- paca->kvm_hstate.tid = thr;
paca->kvm_hstate.napping = 0;
paca->kvm_hstate.kvm_split_mode = sip;
}
@@ -3327,10 +3341,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* When doing micro-threading, poke the inactive threads as well.
* This gets them to the nap instruction after kvm_do_nap,
* which reduces the time taken to unsplit later.
- * For POWER9 HPT guest on radix host, we need all the secondary
- * threads woken up so they can do the LPCR/LPIDR change.
*/
- if (cmd_bit || hpt_on_radix) {
+ if (cmd_bit) {
split_info.do_nap = 1; /* ask secondaries to nap when done */
for (thr = 1; thr < threads_per_subcore; ++thr)
if (!(active & (1 << thr)))
@@ -3391,19 +3403,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpu_relax();
++loops;
}
- } else if (hpt_on_radix) {
- /* Wait for all threads to have seen final sync */
- for (thr = 1; thr < controlled_threads; ++thr) {
- struct paca_struct *paca = paca_ptrs[pcpu + thr];
-
- while (paca->kvm_hstate.kvm_split_mode) {
- HMT_low();
- barrier();
- }
- HMT_medium();
- }
+ split_info.do_nap = 0;
}
- split_info.do_nap = 0;
kvmppc_set_host_core(pcpu);
@@ -3449,10 +3450,17 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
int trap;
unsigned long host_hfscr = mfspr(SPRN_HFSCR);
unsigned long host_ciabr = mfspr(SPRN_CIABR);
- unsigned long host_dawr = mfspr(SPRN_DAWR0);
- unsigned long host_dawrx = mfspr(SPRN_DAWRX0);
+ unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
+ unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
+ unsigned long host_dawr1 = 0;
+ unsigned long host_dawrx1 = 0;
+
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ host_dawr1 = mfspr(SPRN_DAWR1);
+ host_dawrx1 = mfspr(SPRN_DAWRX1);
+ }
/*
* P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
@@ -3489,8 +3497,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
mtspr(SPRN_SPURR, vcpu->arch.spurr);
if (dawr_enabled()) {
- mtspr(SPRN_DAWR0, vcpu->arch.dawr);
- mtspr(SPRN_DAWRX0, vcpu->arch.dawrx);
+ mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+ mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
+ mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
+ }
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
@@ -3542,8 +3554,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
- mtspr(SPRN_DAWR0, host_dawr);
- mtspr(SPRN_DAWRX0, host_dawrx);
+ mtspr(SPRN_DAWR0, host_dawr0);
+ mtspr(SPRN_DAWRX0, host_dawrx0);
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ mtspr(SPRN_DAWR1, host_dawr1);
+ mtspr(SPRN_DAWRX1, host_dawrx1);
+ }
mtspr(SPRN_PID, host_pidr);
/*
@@ -3595,6 +3611,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long host_tidr = mfspr(SPRN_TIDR);
unsigned long host_iamr = mfspr(SPRN_IAMR);
unsigned long host_amr = mfspr(SPRN_AMR);
+ unsigned long host_fscr = mfspr(SPRN_FSCR);
s64 dec;
u64 tb;
int trap, save_pmu;
@@ -3735,6 +3752,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
if (host_amr != vcpu->arch.amr)
mtspr(SPRN_AMR, host_amr);
+ if (host_fscr != vcpu->arch.fscr)
+ mtspr(SPRN_FSCR, host_fscr);
+
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
store_fp_state(&vcpu->arch.fp);
#ifdef CONFIG_ALTIVEC
@@ -4173,7 +4193,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
kvmppc_clear_host_core(pcpu);
- local_paca->kvm_hstate.tid = 0;
local_paca->kvm_hstate.napping = 0;
local_paca->kvm_hstate.kvm_split_mode = NULL;
kvmppc_start_thread(vcpu, vc);
@@ -4358,15 +4377,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
do {
/*
- * The early POWER9 chips that can't mix radix and HPT threads
- * on the same core also need the workaround for the problem
- * where the TLB would prefetch entries in the guest exit path
- * for radix guests using the guest PIDR value and LPID 0.
- * The workaround is in the old path (kvmppc_run_vcpu())
- * but not the new path (kvmhv_run_single_vcpu()).
+ * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
+ * path, which also handles hash and dependent threads mode.
*/
if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
- !no_mixing_hpt_and_radix)
+ !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
vcpu->arch.vcore->lpcr);
else
@@ -5599,6 +5614,26 @@ out:
return ret;
}
+static int kvmhv_enable_dawr1(struct kvm *kvm)
+{
+ if (!cpu_has_feature(CPU_FTR_DAWR1))
+ return -ENODEV;
+
+ /* kvm == NULL means the caller is testing if the capability exists */
+ if (kvm)
+ kvm->arch.dawr1_enabled = true;
+ return 0;
+}
+
+static bool kvmppc_hash_v3_possible(void)
+{
+ if (radix_enabled() && no_mixing_hpt_and_radix)
+ return false;
+
+ return cpu_has_feature(CPU_FTR_ARCH_300) &&
+ cpu_has_feature(CPU_FTR_HVMODE);
+}
+
static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5642,6 +5677,8 @@ static struct kvmppc_ops kvm_ops_hv = {
.store_to_eaddr = kvmhv_store_to_eaddr,
.enable_svm = kvmhv_enable_svm,
.svm_off = kvmhv_svm_off,
+ .enable_dawr1 = kvmhv_enable_dawr1,
+ .hash_v3_possible = kvmppc_hash_v3_possible,
};
static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 8053efdf7ea7..f3d3183249fe 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -277,8 +277,7 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
int ptid = local_paca->kvm_hstate.ptid;
struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
- int me, ee, i, t;
- int cpu0;
+ int me, ee, i;
/* Set our bit in the threads-exiting-guest map in the 0xff00
bits of vcore->entry_exit_map */
@@ -320,22 +319,6 @@ void kvmhv_commence_exit(int trap)
if ((ee >> 8) == 0)
kvmhv_interrupt_vcore(vc, ee);
}
-
- /*
- * On POWER9 when running a HPT guest on a radix host (sip != NULL),
- * we have to interrupt inactive CPU threads to get them to
- * restore the host LPCR value.
- */
- if (sip->lpcr_req) {
- if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
- vc = local_paca->kvm_hstate.kvm_vcore;
- cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
- for (t = 1; t < threads_per_core; ++t) {
- if (sip->napped[t])
- kvmhv_rm_send_ipi(cpu0 + t);
- }
- }
- }
}
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -667,95 +650,6 @@ void kvmppc_bad_interrupt(struct pt_regs *regs)
panic("Bad KVM trap");
}
-/*
- * Functions used to switch LPCR HR and UPRT bits on all threads
- * when entering and exiting HPT guests on a radix host.
- */
-
-#define PHASE_REALMODE 1 /* in real mode */
-#define PHASE_SET_LPCR 2 /* have set LPCR */
-#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */
-#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */
-
-#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
-
-static void wait_for_sync(struct kvm_split_mode *sip, int phase)
-{
- int thr = local_paca->kvm_hstate.tid;
-
- sip->lpcr_sync.phase[thr] |= phase;
- phase = ALL(phase);
- while ((sip->lpcr_sync.allphases & phase) != phase) {
- HMT_low();
- barrier();
- }
- HMT_medium();
-}
-
-void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
-{
- int num_sets;
- unsigned long rb, set;
-
- /* wait for every other thread to get to real mode */
- wait_for_sync(sip, PHASE_REALMODE);
-
- /* Set LPCR and LPIDR */
- mtspr(SPRN_LPCR, sip->lpcr_req);
- mtspr(SPRN_LPID, sip->lpidr_req);
- isync();
-
- /*
- * P10 will flush all the congruence class with a single tlbiel
- */
- if (cpu_has_feature(CPU_FTR_ARCH_31))
- num_sets = 1;
- else
- num_sets = POWER9_TLB_SETS_RADIX;
-
- /* Invalidate the TLB on thread 0 */
- if (local_paca->kvm_hstate.tid == 0) {
- sip->do_set = 0;
- asm volatile("ptesync" : : : "memory");
- for (set = 0; set < num_sets; ++set) {
- rb = TLBIEL_INVAL_SET_LPID +
- (set << TLBIEL_INVAL_SET_SHIFT);
- asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
- "r" (rb), "r" (0));
- }
- asm volatile("ptesync" : : : "memory");
- }
-
- /* indicate that we have done so and wait for others */
- wait_for_sync(sip, PHASE_SET_LPCR);
- /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
- smp_rmb();
-}
-
-/*
- * Called when a thread that has been in the guest needs
- * to reload the host LPCR value - but only on POWER9 when
- * running a HPT guest on a radix host.
- */
-void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
-{
- /* we're out of the guest... */
- wait_for_sync(sip, PHASE_OUT_OF_GUEST);
-
- mtspr(SPRN_LPID, 0);
- mtspr(SPRN_LPCR, sip->host_lpcr);
- isync();
-
- if (local_paca->kvm_hstate.tid == 0) {
- sip->do_restore = 0;
- smp_wmb(); /* order store of do_restore vs. phase */
- }
-
- wait_for_sync(sip, PHASE_RESET_LPCR);
- smp_mb();
- local_paca->kvm_hstate.kvm_split_mode = NULL;
-}
-
static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
{
vcpu->arch.ceded = 0;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 33b58549a9aa..0cd0e7aad588 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -33,8 +33,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
hr->dpdes = vc->dpdes;
hr->hfscr = vcpu->arch.hfscr;
hr->tb_offset = vc->tb_offset;
- hr->dawr0 = vcpu->arch.dawr;
- hr->dawrx0 = vcpu->arch.dawrx;
+ hr->dawr0 = vcpu->arch.dawr0;
+ hr->dawrx0 = vcpu->arch.dawrx0;
hr->ciabr = vcpu->arch.ciabr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
@@ -49,6 +49,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
+ hr->dawr1 = vcpu->arch.dawr1;
+ hr->dawrx1 = vcpu->arch.dawrx1;
}
static void byteswap_pt_regs(struct pt_regs *regs)
@@ -91,6 +93,8 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
hr->pidr = swab64(hr->pidr);
hr->cfar = swab64(hr->cfar);
hr->ppr = swab64(hr->ppr);
+ hr->dawr1 = swab64(hr->dawr1);
+ hr->dawrx1 = swab64(hr->dawrx1);
}
static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
@@ -138,6 +142,7 @@ static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
/* Don't let data address watchpoint match in hypervisor state */
hr->dawrx0 &= ~DAWRX_HYP;
+ hr->dawrx1 &= ~DAWRX_HYP;
/* Don't let completed instruction address breakpt match in HV state */
if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
@@ -151,8 +156,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
vc->pcr = hr->pcr | PCR_MASK;
vc->dpdes = hr->dpdes;
vcpu->arch.hfscr = hr->hfscr;
- vcpu->arch.dawr = hr->dawr0;
- vcpu->arch.dawrx = hr->dawrx0;
+ vcpu->arch.dawr0 = hr->dawr0;
+ vcpu->arch.dawrx0 = hr->dawrx0;
vcpu->arch.ciabr = hr->ciabr;
vcpu->arch.purr = hr->purr;
vcpu->arch.spurr = hr->spurr;
@@ -167,6 +172,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
vcpu->arch.pid = hr->pidr;
vcpu->arch.cfar = hr->cfar;
vcpu->arch.ppr = hr->ppr;
+ vcpu->arch.dawr1 = hr->dawr1;
+ vcpu->arch.dawrx1 = hr->dawrx1;
}
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
@@ -215,12 +222,51 @@ static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
}
}
+static int kvmhv_read_guest_state_and_regs(struct kvm_vcpu *vcpu,
+ struct hv_guest_state *l2_hv,
+ struct pt_regs *l2_regs,
+ u64 hv_ptr, u64 regs_ptr)
+{
+ int size;
+
+ if (kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv->version,
+ sizeof(l2_hv->version)))
+ return -1;
+
+ if (kvmppc_need_byteswap(vcpu))
+ l2_hv->version = swab64(l2_hv->version);
+
+ size = hv_guest_state_size(l2_hv->version);
+ if (size < 0)
+ return -1;
+
+ return kvm_vcpu_read_guest(vcpu, hv_ptr, l2_hv, size) ||
+ kvm_vcpu_read_guest(vcpu, regs_ptr, l2_regs,
+ sizeof(struct pt_regs));
+}
+
+static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu,
+ struct hv_guest_state *l2_hv,
+ struct pt_regs *l2_regs,
+ u64 hv_ptr, u64 regs_ptr)
+{
+ int size;
+
+ size = hv_guest_state_size(l2_hv->version);
+ if (size < 0)
+ return -1;
+
+ return kvm_vcpu_write_guest(vcpu, hv_ptr, l2_hv, size) ||
+ kvm_vcpu_write_guest(vcpu, regs_ptr, l2_regs,
+ sizeof(struct pt_regs));
+}
+
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
{
long int err, r;
struct kvm_nested_guest *l2;
struct pt_regs l2_regs, saved_l1_regs;
- struct hv_guest_state l2_hv, saved_l1_hv;
+ struct hv_guest_state l2_hv = {0}, saved_l1_hv;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 hv_ptr, regs_ptr;
u64 hdec_exp;
@@ -235,17 +281,15 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
hv_ptr = kvmppc_get_gpr(vcpu, 4);
regs_ptr = kvmppc_get_gpr(vcpu, 5);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
- sizeof(struct hv_guest_state)) ||
- kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
- sizeof(struct pt_regs));
+ err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+ hv_ptr, regs_ptr);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (err)
return H_PARAMETER;
if (kvmppc_need_byteswap(vcpu))
byteswap_hv_regs(&l2_hv);
- if (l2_hv.version != HV_GUEST_STATE_VERSION)
+ if (l2_hv.version > HV_GUEST_STATE_VERSION)
return H_P2;
if (kvmppc_need_byteswap(vcpu))
@@ -325,10 +369,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
byteswap_pt_regs(&l2_regs);
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
- sizeof(struct hv_guest_state)) ||
- kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
- sizeof(struct pt_regs));
+ err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+ hv_ptr, regs_ptr);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (err)
return H_AUTHORITY;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index cd9995ee8441..5e634db4809b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -52,11 +52,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define STACK_SLOT_PID (SFS-32)
#define STACK_SLOT_IAMR (SFS-40)
#define STACK_SLOT_CIABR (SFS-48)
-#define STACK_SLOT_DAWR (SFS-56)
-#define STACK_SLOT_DAWRX (SFS-64)
+#define STACK_SLOT_DAWR0 (SFS-56)
+#define STACK_SLOT_DAWRX0 (SFS-64)
#define STACK_SLOT_HFSCR (SFS-72)
#define STACK_SLOT_AMR (SFS-80)
#define STACK_SLOT_UAMOR (SFS-88)
+#define STACK_SLOT_DAWR1 (SFS-96)
+#define STACK_SLOT_DAWRX1 (SFS-104)
/* the following is used by the P9 short path */
#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
@@ -85,19 +87,6 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI_TO_KERNEL
kvmppc_call_hv_entry:
-BEGIN_FTR_SECTION
- /* On P9, do LPCR setting, if necessary */
- ld r3, HSTATE_SPLIT_MODE(r13)
- cmpdi r3, 0
- beq 46f
- lwz r4, KVM_SPLIT_DO_SET(r3)
- cmpwi r4, 0
- beq 46f
- bl kvmhv_p9_set_lpcr
- nop
-46:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_hv_entry
@@ -361,11 +350,11 @@ kvm_secondary_got_guest:
LOAD_REG_ADDR(r6, decrementer_max)
ld r6, 0(r6)
mtspr SPRN_HDEC, r6
+BEGIN_FTR_SECTION
/* and set per-LPAR registers, if doing dynamic micro-threading */
ld r6, HSTATE_SPLIT_MODE(r13)
cmpdi r6, 0
beq 63f
-BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_RPR(r6)
mtspr SPRN_RPR, r0
ld r0, KVM_SPLIT_PMMAR(r6)
@@ -373,16 +362,7 @@ BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_LDBAR(r6)
mtspr SPRN_LDBAR, r0
isync
-FTR_SECTION_ELSE
- /* On P9 we use the split_info for coordinating LPCR changes */
- lwz r4, KVM_SPLIT_DO_SET(r6)
- cmpwi r4, 0
- beq 1f
- mr r3, r6
- bl kvmhv_p9_set_lpcr
- nop
-1:
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
63:
/* Order load of vcpu after load of vcore */
lwsync
@@ -452,19 +432,15 @@ kvm_no_guest:
mtcr r5
blr
-53: HMT_LOW
+53:
+BEGIN_FTR_SECTION
+ HMT_LOW
ld r5, HSTATE_KVM_VCORE(r13)
cmpdi r5, 0
bne 60f
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq kvm_no_guest
- lwz r0, KVM_SPLIT_DO_SET(r3)
- cmpwi r0, 0
- bne kvmhv_do_set
- lwz r0, KVM_SPLIT_DO_RESTORE(r3)
- cmpwi r0, 0
- bne kvmhv_do_restore
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq kvm_no_guest
@@ -472,24 +448,19 @@ kvm_no_guest:
b kvm_unsplit_nap
60: HMT_MEDIUM
b kvm_secondary_got_guest
+FTR_SECTION_ELSE
+ HMT_LOW
+ ld r5, HSTATE_KVM_VCORE(r13)
+ cmpdi r5, 0
+ beq kvm_no_guest
+ HMT_MEDIUM
+ b kvm_secondary_got_guest
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
54: li r0, KVM_HWTHREAD_IN_KVM
stb r0, HSTATE_HWTHREAD_STATE(r13)
b kvm_no_guest
-kvmhv_do_set:
- /* Set LPCR, LPIDR etc. on P9 */
- HMT_MEDIUM
- bl kvmhv_p9_set_lpcr
- nop
- b kvm_no_guest
-
-kvmhv_do_restore:
- HMT_MEDIUM
- bl kvmhv_p9_restore_lpcr
- nop
- b kvm_no_guest
-
/*
* Here the primary thread is trying to return the core to
* whole-core mode, so we need to nap.
@@ -527,7 +498,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Set kvm_split_mode.napped[tid] = 1 */
ld r3, HSTATE_SPLIT_MODE(r13)
li r0, 1
- lbz r4, HSTATE_TID(r13)
+ lhz r4, PACAPACAINDEX(r13)
+ clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */
addi r4, r4, KVM_SPLIT_NAPPED
stbx r0, r3, r4
/* Check the do_nap flag again after setting napped[] */
@@ -711,10 +683,16 @@ BEGIN_FTR_SECTION
mfspr r7, SPRN_DAWRX0
mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_CIABR(r1)
- std r6, STACK_SLOT_DAWR(r1)
- std r7, STACK_SLOT_DAWRX(r1)
+ std r6, STACK_SLOT_DAWR0(r1)
+ std r7, STACK_SLOT_DAWRX0(r1)
std r8, STACK_SLOT_IAMR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
+ mfspr r6, SPRN_DAWR1
+ mfspr r7, SPRN_DAWRX1
+ std r6, STACK_SLOT_DAWR1(r1)
+ std r7, STACK_SLOT_DAWRX1(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
mfspr r5, SPRN_AMR
std r5, STACK_SLOT_AMR(r1)
@@ -801,10 +779,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
lbz r5, 0(r5)
cmpdi r5, 0
beq 1f
- ld r5, VCPU_DAWR(r4)
- ld r6, VCPU_DAWRX(r4)
+ ld r5, VCPU_DAWR0(r4)
+ ld r6, VCPU_DAWRX0(r4)
mtspr SPRN_DAWR0, r5
mtspr SPRN_DAWRX0, r6
+BEGIN_FTR_SECTION
+ ld r5, VCPU_DAWR1(r4)
+ ld r6, VCPU_DAWRX1(r4)
+ mtspr SPRN_DAWR1, r5
+ mtspr SPRN_DAWRX1, r6
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
1:
ld r7, VCPU_CIABR(r4)
ld r8, VCPU_TAR(r4)
@@ -918,15 +902,19 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
- /* For hash guest, clear out and reload the SLB */
ld r6, VCPU_KVM(r4)
lbz r0, KVM_RADIX(r6)
cmpwi r0, 0
bne 9f
+
+ /* For hash guest, clear out and reload the SLB */
+BEGIN_MMU_FTR_SECTION
+ /* Radix host won't have populated the SLB, so no need to clear */
li r6, 0
slbmte r6, r6
- slbia
+ PPC_SLBIA(6)
ptesync
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
lwz r5,VCPU_SLB_MAX(r4)
@@ -1187,6 +1175,20 @@ EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
mr r4, r3
b fast_guest_entry_c
guest_exit_short_path:
+ /*
+ * Malicious or buggy radix guests may have inserted SLB entries
+ * (only 0..3 because radix always runs with UPRT=1), so these must
+ * be cleared here to avoid side-channels. slbmte is used rather
+ * than slbia, as it won't clear cached translations.
+ */
+ li r0,0
+ slbmte r0,r0
+ li r4,1
+ slbmte r0,r4
+ li r4,2
+ slbmte r0,r4
+ li r4,3
+ slbmte r0,r4
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
@@ -1499,7 +1501,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
lbz r0, KVM_RADIX(r5)
li r5, 0
cmpwi r0, 0
- bne 3f /* for radix, save 0 entries */
+ bne 0f /* for radix, save 0 entries */
lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
mtctr r0
li r6,0
@@ -1518,13 +1520,13 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Finally clear out the SLB */
li r0,0
slbmte r0,r0
- slbia
+ PPC_SLBIA(6)
ptesync
-3: stw r5,VCPU_SLB_MAX(r9)
+ stw r5,VCPU_SLB_MAX(r9)
/* load host SLB entries */
BEGIN_MMU_FTR_SECTION
- b 0f
+ b guest_bypass
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
ld r8,PACA_SLBSHADOWPTR(r13)
@@ -1538,7 +1540,21 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
slbmte r6,r5
1: addi r8,r8,16
.endr
-0:
+ b guest_bypass
+
+0: /*
+ * Sanitise radix guest SLB, see guest_exit_short_path comment.
+ * We clear vcpu->arch.slb_max to match earlier behaviour.
+ */
+ li r0,0
+ stw r0,VCPU_SLB_MAX(r9)
+ slbmte r0,r0
+ li r4,1
+ slbmte r0,r4
+ li r4,2
+ slbmte r0,r4
+ li r4,3
+ slbmte r0,r4
guest_bypass:
stw r12, STACK_SLOT_TRAP(r1)
@@ -1759,8 +1775,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* Restore host values of some registers */
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_CIABR(r1)
- ld r6, STACK_SLOT_DAWR(r1)
- ld r7, STACK_SLOT_DAWRX(r1)
+ ld r6, STACK_SLOT_DAWR0(r1)
+ ld r7, STACK_SLOT_DAWRX0(r1)
mtspr SPRN_CIABR, r5
/*
* If the DAWR doesn't work, it's ok to write these here as
@@ -1770,6 +1786,12 @@ BEGIN_FTR_SECTION
mtspr SPRN_DAWRX0, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
+ ld r6, STACK_SLOT_DAWR1(r1)
+ ld r7, STACK_SLOT_DAWRX1(r1)
+ mtspr SPRN_DAWR1, r6
+ mtspr SPRN_DAWRX1, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
+BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
ld r7, STACK_SLOT_PID(r1)
@@ -1938,24 +1960,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
19: lis r8,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC,r8
-16:
-BEGIN_FTR_SECTION
- /* On POWER9 with HPT-on-radix we need to wait for all other threads */
- ld r3, HSTATE_SPLIT_MODE(r13)
- cmpdi r3, 0
- beq 47f
- lwz r8, KVM_SPLIT_DO_RESTORE(r3)
- cmpwi r8, 0
- beq 47f
- bl kvmhv_p9_restore_lpcr
- nop
- b 48f
-47:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
- ld r8,KVM_HOST_LPCR(r4)
+16: ld r8,KVM_HOST_LPCR(r4)
mtspr SPRN_LPCR,r8
isync
-48:
+
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
/* Finish timing, if we have a vcpu */
ld r4, HSTATE_KVM_VCPU(r13)
@@ -2574,8 +2582,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW
rlwimi r5, r4, 2, DAWRX_WT
clrrdi r4, r4, 3
- std r4, VCPU_DAWR(r3)
- std r5, VCPU_DAWRX(r3)
+ std r4, VCPU_DAWR0(r3)
+ std r5, VCPU_DAWRX0(r3)
/*
* If came in through the real mode hcall handler then it is necessary
* to write the registers since the return path won't. Otherwise it is
@@ -2779,8 +2787,10 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
beq kvm_end_cede
cmpwi r0, NAPPING_NOVCPU
beq kvm_novcpu_wakeup
+BEGIN_FTR_SECTION
cmpwi r0, NAPPING_UNSPLIT
beq kvm_unsplit_wakeup
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
twi 31,0,0 /* Nap state must not be zero */
33: mr r4, r3
@@ -3343,13 +3353,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_IAMR, r0
mtspr SPRN_CIABR, r0
mtspr SPRN_DAWRX0, r0
+BEGIN_FTR_SECTION
+ mtspr SPRN_DAWRX1, r0
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
+
+ /* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
+ slbmte r0, r0
+ PPC_SLBIA(6)
BEGIN_MMU_FTR_SECTION
b 4f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
- slbmte r0, r0
- slbia
ptesync
ld r8, PACA_SLBSHADOWPTR(r13)
.rept SLB_NUM_BOLTED
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 288a9820ec01..f38ae3e54b37 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -698,7 +698,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
r = 1;
- };
+ }
return r;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cf52d26f49cd..6c083a9b3545 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -611,8 +611,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
- r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
- cpu_has_feature(CPU_FTR_HVMODE));
+ r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible &&
+ kvmppc_hv_ops->hash_v3_possible());
break;
case KVM_CAP_PPC_NESTED_HV:
r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
@@ -678,6 +678,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = hv_enabled && kvmppc_hv_ops->enable_svm &&
!kvmppc_hv_ops->enable_svm(NULL);
break;
+ case KVM_CAP_PPC_DAWR1:
+ r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
+ !kvmppc_hv_ops->enable_dawr1(NULL));
+ break;
#endif
default:
r = 0;
@@ -2187,6 +2191,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
break;
r = kvm->arch.kvm_ops->enable_svm(kvm);
break;
+ case KVM_CAP_PPC_DAWR1:
+ r = -EINVAL;
+ if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_dawr1)
+ break;
+ r = kvm->arch.kvm_ops->enable_dawr1(kvm);
+ break;
#endif
default:
r = -EINVAL;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 74f9a036bab2..6bcfc5614bbc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -28,7 +28,6 @@
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
#define KVM_MAX_VCPUS 255
-#define KVM_USER_MEM_SLOTS 32
/*
* These seem to be used for allocating ->chip in the routing table, which we
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 18f2d10c3176..474617b88648 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
goto out_unlock_mmap;
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
goto out_unlock_mmap;
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index 7fc82a233f49..3a9a0b0c7465 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -11,8 +11,8 @@
#include <asm/processor.h>
#include <asm/barrier.h>
-#include <asm/qrwlock.h>
#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
#endif /* !(__ASSEMBLY__) */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1feb6c089ba2..cc96e26d69f7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -292,6 +292,7 @@
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
@@ -335,6 +336,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
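/*
 * Illustrative-only userspace probe for the new AVX_VNNI bit: per the word-12
 * comment above, the "(12*32+ 4)" encoding corresponds to bit 4 of
 * CPUID.(EAX=7,ECX=1):EAX. Plain C using the compiler-provided <cpuid.h>.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
		printf("AVX_VNNI supported: %s\n",
		       (eax & (1u << 4)) ? "yes" : "no");
	return 0;
}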
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
new file mode 100644
index 000000000000..355a2ab8fc09
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
+ * "static_call()"s. They are also intended for use when defining
+ * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
+ * functions that follow the [svm|vmx]_func_name convention.
+ * KVM_X86_OP_NULL() can leave a NULL definition for the
+ * case where there is no definition or a function name that
+ * doesn't match the typical naming convention is supplied.
+ */
+KVM_X86_OP_NULL(hardware_enable)
+KVM_X86_OP_NULL(hardware_disable)
+KVM_X86_OP_NULL(hardware_unsetup)
+KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(has_emulated_msr)
+KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(vm_init)
+KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP(vcpu_create)
+KVM_X86_OP(vcpu_free)
+KVM_X86_OP(vcpu_reset)
+KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(vcpu_load)
+KVM_X86_OP(vcpu_put)
+KVM_X86_OP(update_exception_bitmap)
+KVM_X86_OP(get_msr)
+KVM_X86_OP(set_msr)
+KVM_X86_OP(get_segment_base)
+KVM_X86_OP(get_segment)
+KVM_X86_OP(get_cpl)
+KVM_X86_OP(set_segment)
+KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(set_cr0)
+KVM_X86_OP(is_valid_cr4)
+KVM_X86_OP(set_cr4)
+KVM_X86_OP(set_efer)
+KVM_X86_OP(get_idt)
+KVM_X86_OP(set_idt)
+KVM_X86_OP(get_gdt)
+KVM_X86_OP(set_gdt)
+KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr7)
+KVM_X86_OP(cache_reg)
+KVM_X86_OP(get_rflags)
+KVM_X86_OP(set_rflags)
+KVM_X86_OP(tlb_flush_all)
+KVM_X86_OP(tlb_flush_current)
+KVM_X86_OP_NULL(tlb_remote_flush)
+KVM_X86_OP_NULL(tlb_remote_flush_with_range)
+KVM_X86_OP(tlb_flush_gva)
+KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(run)
+KVM_X86_OP_NULL(handle_exit)
+KVM_X86_OP_NULL(skip_emulated_instruction)
+KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(set_interrupt_shadow)
+KVM_X86_OP(get_interrupt_shadow)
+KVM_X86_OP(patch_hypercall)
+KVM_X86_OP(set_irq)
+KVM_X86_OP(set_nmi)
+KVM_X86_OP(queue_exception)
+KVM_X86_OP(cancel_injection)
+KVM_X86_OP(interrupt_allowed)
+KVM_X86_OP(nmi_allowed)
+KVM_X86_OP(get_nmi_mask)
+KVM_X86_OP(set_nmi_mask)
+KVM_X86_OP(enable_nmi_window)
+KVM_X86_OP(enable_irq_window)
+KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP(check_apicv_inhibit_reasons)
+KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
+KVM_X86_OP(refresh_apicv_exec_ctrl)
+KVM_X86_OP(hwapic_irr_update)
+KVM_X86_OP(hwapic_isr_update)
+KVM_X86_OP_NULL(guest_apic_has_interrupt)
+KVM_X86_OP(load_eoi_exitmap)
+KVM_X86_OP(set_virtual_apic_mode)
+KVM_X86_OP_NULL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_posted_interrupt)
+KVM_X86_OP_NULL(sync_pir_to_irr)
+KVM_X86_OP(set_tss_addr)
+KVM_X86_OP(set_identity_map_addr)
+KVM_X86_OP(get_mt_mask)
+KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_NULL(has_wbinvd_exit)
+KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(get_exit_info)
+KVM_X86_OP(check_intercept)
+KVM_X86_OP(handle_exit_irqoff)
+KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(sched_in)
+KVM_X86_OP_NULL(slot_enable_log_dirty)
+KVM_X86_OP_NULL(slot_disable_log_dirty)
+KVM_X86_OP_NULL(flush_log_dirty)
+KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
+KVM_X86_OP_NULL(cpu_dirty_log_size)
+KVM_X86_OP_NULL(pre_block)
+KVM_X86_OP_NULL(post_block)
+KVM_X86_OP_NULL(vcpu_blocking)
+KVM_X86_OP_NULL(vcpu_unblocking)
+KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(apicv_post_state_restore)
+KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_NULL(set_hv_timer)
+KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP(setup_mce)
+KVM_X86_OP(smi_allowed)
+KVM_X86_OP(pre_enter_smm)
+KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enable_smi_window)
+KVM_X86_OP_NULL(mem_enc_op)
+KVM_X86_OP_NULL(mem_enc_reg_region)
+KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(can_emulate_instruction)
+KVM_X86_OP(apic_init_signal_blocked)
+KVM_X86_OP_NULL(enable_direct_tlbflush)
+KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP_NULL(complete_emulated_msr)
+
+#undef KVM_X86_OP
+#undef KVM_X86_OP_NULL
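/*
 * Sketch of how a consumer is expected to instantiate this list; the exact
 * x86.c hunk is not shown in this excerpt, so treat the body below as an
 * assumption modeled on the DECLARE_STATIC_CALL() block added to kvm_host.h
 * later in this diff. Each inclusion of kvm-x86-ops.h expands once per op,
 * so the declaration, definition and update sites can never drift apart.
 */
#define KVM_X86_OP(func) \
	DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
				*(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_NULL KVM_X86_OP
#include <asm/kvm-x86-ops.h>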
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..84499aad01a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,10 +40,8 @@
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
#define KVM_MAX_VCPU_ID 1023
-#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
#define KVM_HALT_POLL_NS_DEFAULT 200000
@@ -52,6 +50,9 @@
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
+ KVM_BUS_LOCK_DETECTION_EXIT)
+
/* x86-specific vcpu->requests bit members */
#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
@@ -200,9 +201,17 @@ enum x86_intercept_stage;
#define DR6_BS (1 << 14)
#define DR6_BT (1 << 15)
#define DR6_RTM (1 << 16)
-#define DR6_FIXED_1 0xfffe0ff0
-#define DR6_INIT 0xffff0ff0
+/*
+ * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
+ * We can regard all the bits in DR6_FIXED_1 as active_low bits;
+ * they will never be 0 for now, but when they are defined
+ * in the future it will require no code change.
+ *
+ * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
+ */
+#define DR6_ACTIVE_LOW 0xffff0ff0
#define DR6_VOLATILE 0x0001e00f
+#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
@@ -337,6 +346,8 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_HAVE_MMU_RWLOCK
+
struct kvm_mmu_page;
/*
@@ -358,8 +369,6 @@ struct kvm_mmu {
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
- void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte);
hpa_t root_hpa;
gpa_t root_pgd;
union kvm_mmu_role mmu_role;
@@ -510,6 +519,7 @@ struct kvm_vcpu_hv_synic {
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
+ struct kvm_vcpu *vcpu;
u32 vp_index;
u64 hv_vapic;
s64 runtime_offset;
@@ -520,6 +530,15 @@ struct kvm_vcpu_hv {
cpumask_t tlb_flush;
};
+/* Xen HVM per vcpu emulation context */
+struct kvm_vcpu_xen {
+ u64 hypercall_rip;
+ bool vcpu_info_set;
+ bool vcpu_time_info_set;
+ struct gfn_to_hva_cache vcpu_info_cache;
+ struct gfn_to_hva_cache vcpu_time_info_cache;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -640,7 +659,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- unsigned long cr3_lm_rsvd_bits;
+ u64 reserved_gpa_bits;
int maxphyaddr;
int max_tdp_level;
@@ -717,7 +736,9 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
- struct kvm_vcpu_hv hyperv;
+ bool hyperv_enabled;
+ struct kvm_vcpu_hv *hyperv;
+ struct kvm_vcpu_xen xen;
cpumask_var_t wbinvd_dirty_mask;
@@ -888,6 +909,14 @@ struct msr_bitmap_range {
unsigned long *bitmap;
};
+/* Xen emulation context */
+struct kvm_xen {
+ bool long_mode;
+ bool shinfo_set;
+ u8 upcall_vector;
+ struct gfn_to_hva_cache shinfo_cache;
+};
+
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
@@ -967,6 +996,7 @@ struct kvm_arch {
struct hlist_head mask_notifier_list;
struct kvm_hv hyperv;
+ struct kvm_xen xen;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
@@ -998,9 +1028,12 @@ struct kvm_arch {
struct msr_bitmap_range ranges[16];
} msr_filter;
+ bool bus_lock_detection_enabled;
+
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+#ifdef CONFIG_X86_64
/*
* Whether the TDP MMU is enabled for this VM. This contains a
* snapshot of the TDP MMU module parameter from when the VM was
@@ -1026,12 +1059,25 @@ struct kvm_arch {
* tdp_mmu_page set and a root_count of 0.
*/
struct list_head tdp_mmu_pages;
+
+ /*
+ * Protects accesses to the following fields when the MMU lock
+ * is held in read mode:
+ * - tdp_mmu_pages (above)
+ * - the link field of struct kvm_mmu_pages used by the TDP MMU
+ * - lpage_disallowed_mmu_pages
+ * - the lpage_disallowed_link field of struct kvm_mmu_pages used
+ * by the TDP MMU
+ * It is acceptable, but not necessary, to acquire this lock when
+ * the thread holds the MMU lock in write mode.
+ */
+ spinlock_t tdp_mmu_pages_lock;
+#endif /* CONFIG_X86_64 */
};
struct kvm_vm_stat {
ulong mmu_shadow_zapped;
ulong mmu_pte_write;
- ulong mmu_pte_updated;
ulong mmu_pde_zapped;
ulong mmu_flooded;
ulong mmu_recycled;
@@ -1340,6 +1386,19 @@ extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops;
+#define KVM_X86_OP(func) \
+ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+
+static inline void kvm_ops_static_call_update(void)
+{
+#define KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+}
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1351,7 +1410,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
if (kvm_x86_ops.tlb_remote_flush &&
- !kvm_x86_ops.tlb_remote_flush(kvm))
+ !static_call(kvm_x86_tlb_remote_flush)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1421,6 +1480,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits;
extern u64 kvm_max_tsc_scaling_ratio;
/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
extern u64 kvm_default_tsc_scaling_ratio;
+/* bus lock detection supported? */
+extern bool kvm_has_bus_lock_exit;
extern u64 kvm_mce_cap_supported;
@@ -1501,7 +1562,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -1742,14 +1803,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_blocking)
- kvm_x86_ops.vcpu_blocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_unblocking)
- kvm_x86_ops.vcpu_unblocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
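/*
 * Illustrative-only check of the DR6 re-encoding earlier in this header:
 * deriving DR6_FIXED_1 from DR6_ACTIVE_LOW reproduces the old literal, so
 * guests still observe the same fixed-1 bits, while DR6_ACTIVE_LOW doubles
 * as the architectural DR6 reset value. Plain userspace C, values copied
 * from the hunk above.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_DR6_ACTIVE_LOW	0xffff0ff0u
#define DEMO_DR6_VOLATILE	0x0001e00fu
#define DEMO_DR6_FIXED_1	(DEMO_DR6_ACTIVE_LOW & ~DEMO_DR6_VOLATILE)

int main(void)
{
	/* Matches the 0xfffe0ff0 literal the old #define hard-coded. */
	assert(DEMO_DR6_FIXED_1 == 0xfffe0ff0u);
	return 0;
}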
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 9aad0e0876fb..8757078d4442 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void)
}
-/** Disable VMX on the current CPU
+/**
+ * cpu_vmxoff() - Disable VMX on the current CPU
*
- * vmxoff causes a undefined-opcode exception if vmxon was not run
- * on the CPU previously. Only call this function if you know VMX
- * is enabled.
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
*/
-static inline void cpu_vmxoff(void)
+static inline int cpu_vmxoff(void)
{
- asm volatile ("vmxoff");
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
}
static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 38ca445a8429..358707f60d99 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -73,6 +73,7 @@
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 9915990fd8cf..d9a74681a77d 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -83,5 +83,6 @@
#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */
#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
#endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9139b3e86316..baca0b00ef76 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -182,6 +182,9 @@ struct arch_shared_info {
unsigned long p2m_cr3; /* cr3 value of the p2m address space */
unsigned long p2m_vaddr; /* virtual address of the p2m list */
unsigned long p2m_generation; /* generation count of p2m mapping */
+#ifdef CONFIG_X86_32
+ uint32_t wc_sec_hi;
+#endif
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 8e76d3701db3..5a3022c8af82 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -112,6 +112,7 @@ struct kvm_ioapic_state {
#define KVM_NR_IRQCHIPS 3
#define KVM_RUN_X86_SMM (1 << 0)
+#define KVM_RUN_X86_BUS_LOCK (1 << 1)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
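/*
 * Hedged VMM-side sketch, not from this diff: once bus-lock detection is
 * enabled, the new flag is reported in the shared kvm_run area after
 * KVM_RUN. The matching KVM_EXIT_X86_BUS_LOCK exit reason is added
 * elsewhere in this pull and is assumed here; see the api.rst update.
 */
#include <linux/kvm.h>
#include <stdio.h>

static void demo_note_bus_lock(const struct kvm_run *run)
{
	if (run->flags & KVM_RUN_X86_BUS_LOCK)
		fprintf(stderr, "guest caused a bus lock (exit reason %u)\n",
			run->exit_reason);
}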
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index ada955c5ebb6..b8e650a985e3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -89,6 +89,7 @@
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
+#define EXIT_REASON_BUS_LOCK 74
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -150,7 +151,8 @@
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
- { EXIT_REASON_TPAUSE, "TPAUSE" }
+ { EXIT_REASON_TPAUSE, "TPAUSE" }, \
+ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7f4c081f59f0..819db00c9388 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1747,6 +1747,7 @@ void apic_ap_setup(void)
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9991c5920aac..b29657b76e3f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -547,31 +547,21 @@ static void emergency_vmx_disable_all(void)
local_irq_disable();
/*
- * We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignores INIT
- * signals when VMX is enabled.
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+ * the machine, because the CPU blocks INIT when it's in VMX root.
*
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
- *
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
+ * doesn't prevent a different CPU from being in VMX root operation.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU. */
- cpu_vmxoff();
+ if (cpu_has_vmx()) {
+ /* Safely force _this_ CPU out of VMX root operation. */
+ __cpu_emergency_vmxoff();
- /* Halt and disable VMX on the other CPUs */
+ /* Halt and exit VMX root operation on the other CPUs. */
nmi_shootdown_cpus(vmxoff_nmi);
-
}
}
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4bd14ab01323..aeab168c5711 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,10 +14,11 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/dirty_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
+kvm-y += x86.o emulate.o i8259.o irq.o lapic.o xen.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o
+ mmu/spte.o
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 38172ca627d3..c8f2592ccc99 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -173,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- kvm_mmu_reset_context(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
kvm_pmu_refresh(vcpu);
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ kvm_hv_set_cpuid(vcpu);
/* Invoke the vendor callback only after the above state is updated. */
- kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+ static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+
+ /*
+ * Except for the MMU, which needs to be reset after any vendor
+ * specific adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_reset_context(vcpu);
}
static int is_efer_nx(void)
@@ -223,6 +229,16 @@ not_found:
return 36;
}
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -434,7 +450,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
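/*
 * Illustrative-only arithmetic for the reserved-GPA rework above, assuming
 * a guest MAXPHYADDR of 48: rsvd_bits(48, 63) sets bits 48..63, and a GPA
 * is legal iff none of those bits are set, which is what the new
 * kvm_vcpu_is_legal_gpa() helper in cpuid.h checks. demo_rsvd_bits() is a
 * stand-in for illustration, not the kernel's rsvd_bits() implementation.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_rsvd_bits(int s, int e)
{
	/* Bits s..e inclusive. */
	return (~0ULL >> (63 - e)) & ~((1ULL << s) - 1);
}

int main(void)
{
	uint64_t reserved = demo_rsvd_bits(48, 63);

	assert(reserved == 0xffff000000000000ULL);
	assert((0x0000ffffffffffffULL & reserved) == 0);	/* legal GPA   */
	assert((0x0001000000000000ULL & reserved) != 0);	/* illegal GPA */
	return 0;
}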
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dc921d76e42e..2a0c5064497f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.maxphyaddr;
}
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+ return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
}
struct cpuid_reg {
@@ -324,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
kvm_cpu_cap_set(x86_feature);
}
-static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
unsigned int kvm_feature)
{
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66a08322988f..f7970ba6219f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
@@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
- val = GET_SMSTATE(u32, smstate, 0x7f68);
+ val = GET_SMSTATE(u64, smstate, 0x7f68);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
- val = GET_SMSTATE(u32, smstate, 0x7f60);
+ val = GET_SMSTATE(u64, smstate, 0x7f60);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
@@ -4329,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_dr(ctxt, 6, &dr6);
dr6 &= ~DR_TRAP_BITS;
- dr6 |= DR6_BD | DR6_RTM;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
return emulate_db(ctxt);
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 922c69dcca4d..7d2dae92d638 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
@@ -36,6 +37,9 @@
#include "trace.h"
#include "irq.h"
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
synic_update_vector(synic, vector);
/* Load SynIC vectors into EOI exit bitmap */
- kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
return 0;
}
@@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
return NULL;
vcpu = kvm_get_vcpu(kvm, vpidx);
- if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
return NULL;
}
@@ -157,15 +161,15 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
vcpu = get_vcpu_by_vpidx(kvm, vpidx);
if (!vcpu)
return NULL;
- synic = vcpu_to_synic(vcpu);
+ synic = to_hv_synic(vcpu);
return (synic->active) ? synic : NULL;
}
static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
int gsi, idx;
@@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
hv_vcpu->exit.u.synic.msr = msr;
@@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
u32 msr, u64 data, bool host)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int ret;
if (!synic->active && !host)
@@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
hv->hv_syndbg.control.status =
@@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
hv_vcpu->exit.u.syndbg.msr = msr;
@@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data);
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
switch (msr) {
case HV_X64_MSR_SYNDBG_CONTROL:
syndbg->control.control = data;
@@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
@@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
break;
}
- trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr,
- *pdata);
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
return 0;
}
@@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
struct kvm_lapic_irq irq;
int ret, vector;
@@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
int i;
trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
@@ -514,7 +515,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
u64 tsc;
@@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
set_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
if (vcpu_kick)
kvm_vcpu_kick(vcpu);
@@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
- trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
hrtimer_cancel(&stimer->timer);
clear_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
stimer->msg_pending = false;
stimer->exp_time = 0;
}
@@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
struct kvm_vcpu_hv_stimer *stimer;
stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
- trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
stimer_mark_pending(stimer, true);
@@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
u64 time_now;
ktime_t ktime_now;
- time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
ktime_now = ktime_get();
if (stimer->config.periodic) {
@@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
stimer->exp_time = time_now + stimer->count;
trace_kvm_hv_stimer_start_periodic(
- stimer_to_vcpu(stimer)->vcpu_id,
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->exp_time);
@@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
return 0;
}
- trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->count);
@@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
stimer_cleanup(stimer);
@@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
stimer_cleanup(stimer);
@@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
struct hv_message *src_msg, bool no_retry)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
gfn_t msg_page_gfn;
struct hv_message_header hv_hdr;
@@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct hv_message *msg = &stimer->msg;
struct hv_timer_message_payload *payload =
(struct hv_timer_message_payload *)&msg->u.payload;
@@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
payload->expiration_time = stimer->exp_time;
payload->delivery_time = get_time_ref_counter(vcpu->kvm);
- return synic_deliver_msg(vcpu_to_synic(vcpu),
+ return synic_deliver_msg(to_hv_synic(vcpu),
stimer->config.sintx, msg,
no_retry);
}
static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
.vector = stimer->config.apic_vector
@@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
r = stimer_send_msg(stimer);
else
r = stimer_notify_direct(stimer);
- trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, direct, r);
if (!r) {
stimer->msg_pending = false;
@@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
u64 time_now, exp_time;
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
stimer = &hv_vcpu->stimer[i];
@@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
}
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
return false;
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
@@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
stimer_prepare_msg(stimer);
}
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu;
int i;
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
synic_init(&hv_vcpu->synic);
bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_init(&hv_vcpu->stimer[i], i);
-}
-
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+
+ return 0;
}
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ if (!to_hv_vcpu(vcpu)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
+ synic = to_hv_synic(vcpu);
/*
* Hyper-V SynIC auto EOI SINT's are
@@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
return r;
}
-static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 *pdata)
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
-static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
*pdata = hv->hv_crash_ctl;
return 0;
}
-static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
-
- if (host)
- hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
- if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) {
-
- vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
- hv->hv_crash_param[0],
- hv->hv_crash_param[1],
- hv->hv_crash_param[2],
- hv->hv_crash_param[3],
- hv->hv_crash_param[4]);
-
- /* Send notification about crash to user space */
- kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
- }
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
return 0;
}
-static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 data)
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -1068,7 +1080,7 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
u32 tsc_seq;
u64 gfn;
@@ -1078,7 +1090,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
return;
- mutex_lock(&kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1122,14 +1134,14 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
out_unlock:
- mutex_unlock(&kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1139,9 +1151,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
break;
case HV_X64_MSR_HYPERCALL: {
- u64 gfn;
- unsigned long addr;
- u8 instructions[4];
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
/* if guest os id is not set hypercall should remain disabled */
if (!hv->hv_guest_os_id)
@@ -1150,16 +1162,33 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall = data;
break;
}
- gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- kvm_x86_ops.patch_hypercall(vcpu, instructions);
- ((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (__copy_to_user((void __user *)addr, instructions, 4))
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting the bit 31 of EAX
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
return 1;
hv->hv_hypercall = data;
- mark_page_dirty(kvm, gfn);
break;
}
case HV_X64_MSR_REFERENCE_TSC:
@@ -1168,11 +1197,25 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_set_crash_data(vcpu,
+ return kvm_hv_msr_set_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
data);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
case HV_X64_MSR_RESET:
if (data == 1) {
vcpu_debug(vcpu, "hyper-v reset requested\n");
@@ -1216,11 +1259,11 @@ static u64 current_task_runtime_100ns(void)
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
int vcpu_idx = kvm_vcpu_get_idx(vcpu);
u32 new_vp_index = (u32)data;
@@ -1291,14 +1334,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1307,7 +1350,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1330,7 +1373,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
{
u64 data = 0;
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1346,11 +1389,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = hv->hv_tsc_page;
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_get_crash_data(vcpu,
+ return kvm_hv_msr_get_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
pdata);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
case HV_X64_MSR_RESET:
data = 0;
break;
@@ -1379,7 +1422,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
bool host)
{
u64 data = 0;
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX:
@@ -1403,14 +1446,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1419,7 +1462,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1438,12 +1481,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1451,12 +1504,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_get_msr(vcpu, msr, pdata, host);
@@ -1466,7 +1529,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
u64 *vp_bitmap, unsigned long *vcpu_bitmap)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
int i, bank, sbank = 0;
@@ -1483,18 +1546,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
- (unsigned long *)vp_bitmap))
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
__set_bit(i, vcpu_bitmap);
}
return vcpu_bitmap;
}
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
- u16 rep_cnt, bool ex)
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
{
- struct kvm *kvm = current_vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1592,10 +1653,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
bool ex, bool fast)
{
- struct kvm *kvm = current_vcpu->kvm;
+ struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1666,9 +1727,20 @@ ret_success:
return HV_STATUS_SUCCESS;
}
-bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
- return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0;
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
+ if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+ vcpu->arch.hyperv_enabled = true;
+ else
+ vcpu->arch.hyperv_enabled = false;
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
@@ -1698,6 +1770,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
struct eventfd_ctx *eventfd;
if (unlikely(!fast)) {
@@ -1726,7 +1799,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
rcu_read_lock();
- eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ eventfd = idr_find(&hv->conn_to_evt, param);
rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1745,7 +1818,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -1793,7 +1866,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -1855,7 +1928,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
fallthrough;
case HVCALL_RESET_DEBUG_SESSION: {
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu)) {
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
@@ -1885,23 +1958,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
void kvm_hv_init_vm(struct kvm *kvm)
{
- mutex_init(&kvm->arch.hyperv.hv_lock);
- idr_init(&kvm->arch.hyperv.conn_to_evt);
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
}
void kvm_hv_destroy_vm(struct kvm *kvm)
{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int i;
- idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
eventfd_ctx_put(eventfd);
- idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+ idr_destroy(&hv->conn_to_evt);
}
static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int ret;
@@ -1925,7 +2001,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
mutex_lock(&hv->hv_lock);
@@ -1997,8 +2073,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
break;
case HYPERV_CPUID_INTERFACE:
- memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
- ent->eax = signature[0];
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
break;
case HYPERV_CPUID_VERSION:
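/*
 * Illustrative-only dump of the hypercall page bytes that the rewritten
 * HV_X64_MSR_HYPERCALL handler above emits when Xen hypercalls are also
 * enabled. The VMX encoding of vmcall (0f 01 c1) is assumed here; on AMD
 * the patched instruction would be vmmcall (0f 01 d9) instead.
 */
#include <stdio.h>

int main(void)
{
	static const unsigned char page[] = {
		0x0d, 0x00, 0x00, 0x00, 0x80,	/* orl $0x80000000, %eax */
		0x0f, 0x01, 0xc1,		/* vmcall                */
		0xc3,				/* ret                   */
	};

	/* Nine bytes total, hence "u8 instructions[9]" replacing the old [4]. */
	printf("hypercall page stub: %zu bytes\n", sizeof(page));
	return 0;
}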
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 6d7def2b0aad..e951af1fcb2c 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -50,38 +50,46 @@
/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
-static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
{
- return &vcpu->arch.hyperv;
+ return &kvm->arch.hyperv;
}
-static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_arch *arch;
-
- arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
- return container_of(arch, struct kvm_vcpu, arch);
+ return vcpu->arch.hyperv;
}
-static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
{
- return &vcpu->arch.hyperv.synic;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
}
-static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
{
- return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
}
-static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
{
return &vcpu->kvm->arch.hyperv.hv_syndbg;
}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+}
+
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
@@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
struct hv_vp_assist_page *assist_page);
-static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
- int timer_index)
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
{
- return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
}
-static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
{
struct kvm_vcpu_hv *hv_vcpu;
hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
stimer[0]);
- return hv_vcpu_to_vcpu(hv_vcpu);
+ return hv_vcpu->vcpu;
}
static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
{
- return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
HV_SYNIC_STIMER_COUNT);
}
@@ -125,6 +136,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 814698e5b152..172b05343cfd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -14,6 +14,7 @@
#include "irq.h"
#include "i8254.h"
#include "x86.h"
+#include "xen.h"
/*
* check if there are pending timer events
@@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
if (!kvm_apic_accept_pic_intr(v))
return 0;
@@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+
if (irqchip_split(v->kvm)) {
int vector = v->arch.pending_external_vector;
@@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- if (kvm_x86_ops.migrate_timers)
- kvm_x86_ops.migrate_timers(vcpu);
+ static_call_cond(kvm_x86_migrate_timers)(vcpu);
}
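The kvm_x86_ops.foo(vcpu) -> static_call(kvm_x86_foo)(vcpu) conversions seen throughout this patch rely on the kernel's static-call infrastructure; static_call_cond() additionally folds a NULL callback into a no-op. The declarations and static_call_update() wiring live outside this hunk, so the following is only a rough, self-contained sketch of the idiom using a made-up "demo" op:

#include <linux/static_call.h>

/* Hypothetical callback table; mirrors how a vendor ops struct is wired up. */
struct demo_ops {
	void (*migrate_timers)(int cpu);	/* optional, may be NULL */
};

DEFINE_STATIC_CALL_NULL(demo_migrate_timers, *(((struct demo_ops *)0)->migrate_timers));

static void demo_hardware_setup(const struct demo_ops *ops)
{
	/* Patch the call site once at setup time instead of reading a pointer. */
	static_call_update(demo_migrate_timers, ops->migrate_timers);
}

static void demo_migrate(int cpu)
{
	/* Direct call when set; a NULL callback turns this into a no-op. */
	static_call_cond(demo_migrate_timers)(cpu);
}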
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a889563ad02d..2e11da2f5621 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
return 0;
if (!kvm_register_is_available(vcpu, reg))
- kvm_x86_ops.cache_reg(vcpu, reg);
+ static_call(kvm_x86_cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -118,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -132,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 43c93ffa76ed..0d359115429a 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -205,7 +205,7 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43cceadd073e..45d40bfacb7c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-struct static_key_deferred apic_hw_disabled __read_mostly;
-struct static_key_deferred apic_sw_disabled __read_mostly;
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
static inline int apic_enabled(struct kvm_lapic *apic)
{
@@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
if (enabled != apic->sw_enabled) {
apic->sw_enabled = enabled;
if (enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
else
- static_key_slow_inc(&apic_sw_disabled.key);
+ static_branch_inc(&apic_sw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
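apic_hw_disabled/apic_sw_disabled move from the old struct static_key_deferred API to DEFINE_STATIC_KEY_DEFERRED_FALSE(key, HZ), which folds the rate limit (previously configured via jump_label_rate_limit() in the now-deleted kvm_lapic_init()) into the definition. A small self-contained sketch of the same idiom with a made-up key:

#include <linux/jump_label_ratelimit.h>

/* Hypothetical key: false by default; decrements are rate-limited to once per HZ. */
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(demo_disabled, HZ);

static bool demo_enabled(void)
{
	/* Compiles to a patched jump; the branch body is normally skipped. */
	if (static_branch_unlikely(&demo_disabled.key))
		return false;
	return true;
}

static void demo_set_enabled(bool enable)
{
	if (enable)
		static_branch_slow_dec_deferred(&demo_disabled);	/* deferred by HZ */
	else
		static_branch_inc(&demo_disabled.key);			/* immediate */
}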
@@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
@@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu, vec);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu,
- apic_find_highest_isr(apic));
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
+ apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (apic->vcpu->arch.apicv_active)
- highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu);
+ highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
+ if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
@@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ if (to_hv_vcpu(apic->vcpu) &&
+ test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
kvm_hv_synic_send_eoi(apic->vcpu, vector);
kvm_ioapic_send_eoi(apic, vector);
@@ -1814,7 +1815,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- kvm_x86_ops.cancel_hv_timer(apic->vcpu);
+ static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -1831,7 +1832,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
+ if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2175,10 +2176,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
if (!apic->sw_enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
if (apic->regs)
free_page((unsigned long)apic->regs);
@@ -2250,9 +2251,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
} else {
- static_key_slow_inc(&apic_hw_disabled.key);
+ static_branch_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
}
@@ -2261,7 +2262,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- kvm_x86_ops.set_virtual_apic_mode(vcpu);
+ static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2338,9 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu, -1);
- kvm_x86_ops.hwapic_isr_update(vcpu, -1);
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2449,7 +2450,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
* thinking that APIC state has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
- static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -2512,7 +2513,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
*/
apic_clear_irr(vector, apic);
- if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
/*
* For auto-EOI interrupts, there might be another pending
* interrupt above PPR, so check whether to raise another
@@ -2601,10 +2602,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops.hwapic_isr_update(vcpu,
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -2904,13 +2905,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
}
}
-void kvm_lapic_init(void)
-{
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&apic_hw_disabled, HZ);
- jump_label_rate_limit(&apic_sw_disabled, HZ);
-}
-
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4fb86e3a9dd3..997c45a5963a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
+#include "hyperv.h"
+
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6
@@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-}
-
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
-void kvm_lapic_init(void);
void kvm_lapic_exit(void);
#define VEC_POS(v) ((v) & (32 - 1))
@@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va
__kvm_lapic_set_reg(apic->regs, reg_off, val);
}
-extern struct static_key kvm_no_apic_vcpu;
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
- if (static_key_false(&kvm_no_apic_vcpu))
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
return vcpu->arch.apic;
return true;
}
-extern struct static_key_deferred apic_hw_disabled;
+extern struct static_key_false_deferred apic_hw_disabled;
static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_hw_disabled.key))
+ if (static_branch_unlikely(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
return MSR_IA32_APICBASE_ENABLE;
}
-extern struct static_key_deferred apic_sw_disabled;
+extern struct static_key_false_deferred apic_sw_disabled;
static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_sw_disabled.key))
+ if (static_branch_unlikely(&apic_sw_disabled.key))
return apic->sw_enabled;
return true;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 261be1d2032b..c68bfc3e2402 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
vcpu->arch.mmu->shadow_root_level);
}
@@ -152,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*
* TODO: introduce APIs to split these two cases.
*/
-static inline int is_writable_pte(unsigned long pte)
+static inline bool is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -174,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pte_access, unsigned pte_pkey,
unsigned pfec)
{
- int cpl = kvm_x86_ops.get_cpl(vcpu);
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
/*
* If CPL < 3, SMAP prevention is disabled if EFLAGS.AC = 1.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..e507568cd55d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
int ret = -ENOTSUPP;
if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
+ ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
if (ret)
kvm_flush_remote_tlbs(kvm);
@@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
int i, count = 0;
if (!rmap_head->val) {
- rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
- rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
@@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
pr_err("%s: %p 0->BUG\n", __func__, spte);
BUG();
} else if (!(rmap_head->val & 1)) {
- rmap_printk("%s: %p 1->0\n", __func__, spte);
+ rmap_printk("%p 1->0\n", spte);
if ((u64 *)rmap_head->val != spte) {
pr_err("%s: %p 1->BUG\n", __func__, spte);
BUG();
}
rmap_head->val = 0;
} else {
- rmap_printk("%s: %p many->many\n", __func__, spte);
+ rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
@@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
@@ -1184,7 +1184,7 @@ static bool spte_set_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
/*
* Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
@@ -1225,7 +1225,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
while (mask) {
@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
while (mask) {
@@ -1283,8 +1283,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask)
{
if (kvm_x86_ops.enable_log_dirty_pt_masked)
- kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
- mask);
+ static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
+ gfn_offset,
+ mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
@@ -1292,7 +1293,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
int kvm_cpu_dirty_log_size(void)
{
if (kvm_x86_ops.cpu_dirty_log_size)
- return kvm_x86_ops.cpu_dirty_log_size();
+ return static_call(kvm_x86_cpu_dirty_log_size)();
return 0;
}
@@ -1309,7 +1310,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
@@ -1331,7 +1332,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+ rmap_printk("spte %p %llx.\n", sptep, *sptep);
pte_list_remove(rmap_head, sptep);
flush = true;
@@ -1363,7 +1364,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1456,16 +1457,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+static __always_inline int
+kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -1521,7 +1523,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
return r;
@@ -1533,7 +1535,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
return r;
@@ -1588,7 +1590,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int young = false;
young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
return young;
@@ -1599,7 +1601,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
int young = false;
young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
return young;
@@ -1723,13 +1725,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
return 0;
}
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *pte)
-{
- WARN_ON(1);
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -2016,9 +2011,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
flush = false;
}
}
@@ -2417,7 +2412,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
return 0;
restart:
- list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
/*
* Don't zap active root pages, the page itself can't be freed
* and zapping it will just force vCPUs to realloc and reload.
@@ -2470,7 +2465,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2481,7 +2476,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2492,7 +2487,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
@@ -2500,7 +2495,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return r;
}
@@ -3161,7 +3156,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (kvm_mmu_put_root(kvm, sp)) {
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_free_root(kvm, sp);
else if (sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3192,7 +3187,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return;
}
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3215,7 +3210,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3236,16 +3231,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu)) {
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return INVALID_PAGE;
}
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
@@ -3255,7 +3250,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
hpa_t root;
unsigned i;
- if (vcpu->kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
if (!VALID_PAGE(root))
@@ -3416,17 +3411,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
!smp_load_acquire(&sp->unsync_children))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
mmu_sync_children(vcpu, sp);
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
@@ -3440,7 +3435,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3724,7 +3719,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return r;
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_lock(&vcpu->kvm->mmu_lock);
+ else
+ write_lock(&vcpu->kvm->mmu_lock);
+
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
@@ -3739,7 +3739,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
prefault, is_tdp);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_unlock(&vcpu->kvm->mmu_lock);
+ else
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
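With mmu_lock now an rwlock, TDP MMU page faults take it for read so they can run concurrently, while faults on legacy shadow roots (and all other MMU paths in this series) still take it for write. The hunk above open-codes that choice; a condensed sketch of the pattern with made-up wrapper names:

/* Illustrative wrappers around the locking pattern used in direct_page_fault(). */
static void demo_fault_lock(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
		read_lock(&vcpu->kvm->mmu_lock);	/* concurrent faults allowed */
	else
		write_lock(&vcpu->kvm->mmu_lock);	/* legacy MMU stays exclusive */
}

static void demo_fault_unlock(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
		read_unlock(&vcpu->kvm->mmu_lock);
	else
		write_unlock(&vcpu->kvm->mmu_lock);
}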
@@ -3813,7 +3816,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
@@ -3984,20 +3986,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
static void
__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, int level, bool nx, bool gbpages,
+ u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
- u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
rsvd_check->bad_mt_xwr = 0;
- if (!nx)
- exb_bit_rsvd = rsvd_bits(63, 63);
if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
/*
* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
* leaf entries) on AMD CPUs only.
@@ -4026,45 +4035,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_5LEVEL:
- rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
fallthrough;
case PT64_ROOT_4LEVEL:
- rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
rsvd_check->rsvd_bits_mask[1][3] =
rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 29);
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
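The maxphyaddr parameter is replaced by a precomputed pa_bits_rsvd mask, from which high_bits_rsvd is derived once instead of calling rsvd_bits(maxphyaddr, 51) for every entry. A quick worked example, assuming rsvd_bits(s, e) builds a mask of bits s..e inclusive and a guest MAXPHYADDR of 48:

/* Worked example, assuming guest MAXPHYADDR == 48. */
static void demo_rsvd_masks(void)
{
	u64 pa_bits_rsvd   = rsvd_bits(48, 63);			/* 0xffff000000000000 */
	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);	/* 0x000f000000000000 */

	/*
	 * Without NX, bit 63 is reserved too, so a 4-level PTE mask becomes
	 * high_bits_rsvd | rsvd_bits(63, 63) == 0x800f000000000000 -- the same
	 * value the old code built from rsvd_bits(maxphyaddr, 51) | exb_bit_rsvd.
	 */
}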
@@ -4075,8 +4078,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), context->root_level,
- context->nx,
+ vcpu->arch.reserved_gpa_bits,
+ context->root_level, context->nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu),
guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4084,27 +4087,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, bool execonly)
+ u64 pa_bits_rsvd, bool execonly)
{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 bad_mt_xwr;
- rsvd_check->rsvd_bits_mask[0][4] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][3] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4123,7 +4121,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), execonly);
+ vcpu->arch.reserved_gpa_bits, execonly);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(shadow_phys_bits, 63);
}
/*
@@ -4145,7 +4148,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
*/
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, uses_nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), true);
@@ -4182,14 +4185,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- shadow_phys_bits,
- false);
+ reserved_hpa_bits(), false);
if (!shadow_me_mask)
return;
@@ -4209,7 +4211,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- shadow_phys_bits, execonly);
+ reserved_hpa_bits(), execonly);
}
#define BYTE_MASK(access) \
@@ -4395,7 +4397,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
context->direct_map = false;
}
@@ -4424,7 +4425,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
@@ -4506,7 +4506,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
@@ -4678,7 +4677,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
- context->update_pte = ept_update_pte;
context->root_level = level;
context->direct_map = false;
context->mmu_role.as_u64 = new_role.as_u64;
@@ -4811,7 +4809,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
kvm_mmu_load_pgd(vcpu);
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
@@ -4826,19 +4824,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *new)
-{
- if (sp->role.level != PG_LEVEL_4K) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
static bool need_remote_flush(u64 old, u64 new)
{
if (!is_shadow_present_pte(old))
@@ -4954,22 +4939,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-/*
- * Ignore various flags when determining if a SPTE can be immediately
- * overwritten for the current MMU.
- * - level: explicitly checked in mmu_pte_write_new_pte(), and will never
- * match the current MMU role, as MMU's level tracks the root level.
- * - access: updated based on the new guest PTE
- * - quadrant: handled by get_written_sptes()
- * - invalid: always false (loop only walks valid shadow pages)
- */
-static const union kvm_mmu_page_role role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .invalid = 0x1,
-};
-
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
struct kvm_page_track_notifier_node *node)
@@ -4999,7 +4968,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
mmu_topup_memory_caches(vcpu, true);
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5020,14 +4989,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
local_flush = true;
while (npte--) {
- u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
- if (gentry &&
- !((sp->role.word ^ base_role) & ~role_ign.word) &&
- rmap_can_add(vcpu))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -5035,7 +5000,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -5125,7 +5090,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
}
if (!mmu->invlpg)
@@ -5182,7 +5147,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5233,14 +5198,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (iterator.rmap)
flush |= fn(kvm, iterator.rmap);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
flush = false;
}
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
@@ -5390,7 +5355,7 @@ restart:
* be in active use by the guest.
*/
if (batch >= BATCH_ZAP_PAGES &&
- cond_resched_lock(&kvm->mmu_lock)) {
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
batch = 0;
goto restart;
}
@@ -5423,7 +5388,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
lockdep_assert_held(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
/*
@@ -5447,10 +5412,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5492,7 +5457,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
@@ -5510,13 +5475,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
}
- if (kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(kvm)) {
flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5531,12 +5496,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* We can flush all the TLBs out of the mmu lock without TLB
@@ -5596,13 +5561,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
kvm_mmu_zap_collapsible_spte, true);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5625,11 +5590,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5647,12 +5612,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5664,11 +5629,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5681,23 +5646,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
LIST_HEAD(invalid_list);
int ign;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
- if (cond_resched_lock(&kvm->mmu_lock))
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
goto restart;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5757,7 +5722,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
continue;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
@@ -5768,7 +5733,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
/*
@@ -5988,7 +5953,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ulong to_zap;
rcu_idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -6005,22 +5970,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page,
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp)) {
kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
- else {
+ } else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index c8d51a37e2ce..ced15fd58fde 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
}
static bool mmu_audit;
-static struct static_key mmu_audit_key;
+static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
@@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
- if (static_key_false((&mmu_audit_key)))
+ if (static_branch_unlikely((&mmu_audit_key)))
__kvm_mmu_audit(vcpu, point);
}
@@ -259,7 +259,7 @@ static void mmu_audit_enable(void)
if (mmu_audit)
return;
- static_key_slow_inc(&mmu_audit_key);
+ static_branch_inc(&mmu_audit_key);
mmu_audit = true;
}
@@ -268,7 +268,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;
- static_key_slow_dec(&mmu_audit_key);
+ static_branch_dec(&mmu_audit_key);
mmu_audit = false;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bfc6389edc28..9e38d3c5daad 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -12,7 +12,7 @@
extern bool dbg;
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
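The reworked rmap_printk() prepends the calling function itself, which is why the call sites earlier in this patch drop their explicit "%s: "/__func__ arguments. Roughly, a call such as rmap_printk("%p 1->0\n", spte) inside __pte_list_remove() now behaves like the sketch below (illustrative only, with MMU debugging compiled in and dbg set):

/* Approximate expansion of the new macro at a call site. */
static void demo_rmap_printk(u64 *spte)
{
	printk("%s: %p 1->0\n", __func__, spte);	/* __func__ is supplied by the macro */
}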
@@ -56,7 +56,12 @@ struct kvm_mmu_page {
/* Number of writes since the last time traversal visited this page. */
atomic_t write_flooding_count;
+#ifdef CONFIG_X86_64
bool tdp_mmu_page;
+
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
};
extern struct kmem_cache *mmu_page_header_cache;
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 8443a675715b..34bb0ec69bd8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_del_rcu(&n->node);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..d9f66cc459e8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
}
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
@@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
level = iterator.level;
sptep = iterator.sptep;
@@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c51ad544f25b..ef55f0bc4ccf 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+ spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
if (host_writable)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 2b3a30bd38b0..6de3950fd704 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -131,6 +131,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * This constant works because it is considered non-present on both AMD and
+ * Intel CPUs and does not create an L1TF vulnerability because the pfn section
+ * is zeroed out.
+ *
+ * Only used by the TDP MMU.
+ */
+#define REMOVED_SPTE (1ull << 59)
+
+static inline bool is_removed_spte(u64 spte)
+{
+ return spte == REMOVED_SPTE;
+}
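Threads that hold mmu_lock only for read install REMOVED_SPTE with an atomic compare-and-exchange before doing a multi-part teardown, so concurrent walkers see a non-present placeholder rather than a half-modified entry. A simplified sketch of that handshake; this is not the real tdp_mmu.c helper, which carries additional bookkeeping:

#include <asm/cmpxchg.h>

/*
 * Simplified sketch: atomically replace an SPTE only if it still holds the
 * value we last read; losing the race means another thread changed it first.
 */
static bool demo_zap_spte_atomic(u64 *sptep, u64 old_spte)
{
	if (cmpxchg64(sptep, old_spte, REMOVED_SPTE) != old_spte)
		return false;		/* raced with another thread, retry */

	/* ... flush TLBs, free child page tables, etc. ... */

	/* Finally publish the real new value (here: zero / not present). */
	WRITE_ONCE(*sptep, 0);
	return true;
}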
+
+/*
* In some cases, we need to preserve the GFN of a non-present or reserved
* SPTE when we usurp the upper five bits of the physical address space to
* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
@@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline int is_shadow_present_pte(u64 pte)
+static inline bool is_shadow_present_pte(u64 pte)
{
- return (pte != 0) && !is_mmio_spte(pte);
+ return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
}
-static inline int is_large_pte(u64 pte)
+static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
}
-static inline int is_last_spte(u64 pte, int level)
+static inline bool is_last_spte(u64 pte, int level)
{
- if (level == PG_LEVEL_4K)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
static inline bool is_executable_pte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 87b7e16911db..e5f148106e20 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -22,21 +22,22 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
- * rooted at root_pt, starting with the walk to translate goal_gfn.
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn)
+ int min_level, gfn_t next_last_level_gfn)
{
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
- iter->goal_gfn = goal_gfn;
+ iter->next_last_level_gfn = next_last_level_gfn;
+ iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
iter->level = root_level;
- iter->pt_path[iter->level - 1] = root_pt;
+ iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
@@ -47,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
* address of the child page table referenced by the SPTE. Returns null if
* there is no such entry.
*/
-u64 *spte_to_child_pt(u64 spte, int level)
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
/*
* There's no child entry if this entry isn't present or is a
@@ -56,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
return NULL;
- return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
/*
@@ -65,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
*/
static bool try_step_down(struct tdp_iter *iter)
{
- u64 *child_pt;
+ tdp_ptep_t child_pt;
if (iter->level == iter->min_level)
return false;
@@ -74,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -82,7 +83,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -106,9 +107,9 @@ static bool try_step_side(struct tdp_iter *iter)
return false;
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
- iter->goal_gfn = iter->gfn;
+ iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
return true;
}
@@ -158,24 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
}
-/*
- * Restart the walk over the paging structure from the root, starting from the
- * highest gfn the iterator had previously reached. Assumes that the entire
- * paging structure, except the root page, may have been completely torn down
- * and rebuilt.
- */
-void tdp_iter_refresh_walk(struct tdp_iter *iter)
-{
- gfn_t goal_gfn = iter->goal_gfn;
-
- if (iter->gfn > goal_gfn)
- goal_gfn = iter->gfn;
-
- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
- iter->root_level, iter->min_level, goal_gfn);
-}
-
-u64 *tdp_iter_root_pt(struct tdp_iter *iter)
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
{
return iter->pt_path[iter->root_level - 1];
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 47170d0dc98e..4cc177d75c4a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,6 +7,8 @@
#include "mmu.h"
+typedef u64 __rcu *tdp_ptep_t;
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
@@ -15,11 +17,17 @@ struct tdp_iter {
* The iterator will traverse the paging structure towards the mapping
* for this GFN.
*/
- gfn_t goal_gfn;
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
/* Pointers to the page tables traversed to reach the current SPTE */
- u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
- u64 *sptep;
+ tdp_ptep_t sptep;
/* The lowest GFN mapped by the current SPTE */
gfn_t gfn;
/* The level of the root page given to the iterator */
@@ -49,12 +57,11 @@ struct tdp_iter {
#define for_each_tdp_pte(iter, root, root_level, start, end) \
for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
-u64 *spte_to_child_pt(u64 pte, int level);
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn);
+ int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
-void tdp_iter_refresh_walk(struct tdp_iter *iter);
-u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
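A note on the __rcu annotation introduced by the tdp_ptep_t typedef above: sparse now expects every dereference of these pointers to go through rcu_dereference(), which is why the .c hunks wrap each read. A minimal sketch of that access pattern, with illustrative names (my_ptep_t, my_read_spte) that are not part of the patch:

#include <linux/rcupdate.h>
#include <linux/types.h>

typedef u64 __rcu *my_ptep_t;	/* same shape as tdp_ptep_t above */

/* Read an SPTE through an __rcu-annotated pointer from a read-side section. */
static u64 my_read_spte(my_ptep_t ptep)
{
	u64 spte;

	rcu_read_lock();
	/* rcu_dereference() satisfies sparse's __rcu checking and orders the
	 * pointer load against a concurrent publisher. */
	spte = READ_ONCE(*rcu_dereference(ptep));
	rcu_read_unlock();

	return spte;
}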
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b56d604809b8..71e100a5670f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,32 +7,23 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-#endif
-
-static bool is_tdp_mmu_enabled(void)
-{
-#ifdef CONFIG_X86_64
- return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
-#else
- return false;
-#endif /* CONFIG_X86_64 */
-}
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
- if (!is_tdp_mmu_enabled())
+ if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
return;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
@@ -42,6 +33,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
return;
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down.
+ */
+ rcu_barrier();
}
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -53,7 +50,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
struct kvm_mmu_page *root)
{
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
return false;
@@ -88,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
#define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
-{
- struct kvm_mmu_page *sp;
-
- if (!kvm->arch.tdp_mmu_enabled)
- return false;
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return sp->tdp_mmu_page && sp->root_count;
-}
-
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield);
@@ -111,7 +92,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(root->root_count);
WARN_ON(!root->tdp_mmu_page);
@@ -164,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
if (root->role.word == role.word) {
kvm_mmu_get_root(kvm, root);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
}
@@ -180,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
@@ -196,8 +177,31 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
return __pa(root->spt);
}
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
+}
+
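The callback above pairs with the rcu_barrier() added to kvm_mmu_uninit_tdp_mmu(): freed page tables are queued with call_rcu(), and the barrier guarantees every queued callback has run before the VM is torn down. A generic sketch of that pattern, using made-up my_node names rather than KVM's structures:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_node {
	int data;
	struct rcu_head rcu_head;
};

static void my_node_free_rcu(struct rcu_head *head)
{
	/* Recover the enclosing object from the embedded rcu_head. */
	kfree(container_of(head, struct my_node, rcu_head));
}

static void my_node_retire(struct my_node *node)
{
	/* Readers inside rcu_read_lock() sections may still see the node;
	 * defer the free until a grace period has elapsed. */
	call_rcu(&node->rcu_head, my_node_free_rcu);
}

static void my_subsystem_teardown(void)
{
	/* Wait for every outstanding my_node_free_rcu() to finish. */
	rcu_barrier();
}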
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level);
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
@@ -235,6 +239,128 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
+ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ * @account_nx: This page replaces a NX large page and should be marked for
+ * eventual reclaim.
+ */
+static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared, bool account_nx)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_del(&sp->link);
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
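Both helpers above use the same convention: with shared == true the caller holds the MMU rwlock only for read, so the page list is protected by the new tdp_mmu_pages_lock spinlock; with shared == false the caller must hold the rwlock for write and no extra lock is taken. A condensed sketch of that dual-mode locking, with illustrative names (my_kvm_lock, my_list_lock, my_add):

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_kvm_lock);	/* stands in for kvm->mmu_lock */
static DEFINE_SPINLOCK(my_list_lock);	/* stands in for tdp_mmu_pages_lock */
static LIST_HEAD(my_pages);

static void my_add(struct list_head *entry, bool shared)
{
	if (shared)
		/* Read-lock holders can race, so serialize on the spinlock. */
		spin_lock(&my_list_lock);
	else
		/* Write-lock holders already exclude each other. */
		lockdep_assert_held_write(&my_kvm_lock);

	list_add(entry, &my_pages);

	if (shared)
		spin_unlock(&my_list_lock);
}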
+/**
+ * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ */
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+ bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(pt);
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ u64 old_child_spte;
+ u64 *sptep;
+ gfn_t gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_page(kvm, sp, shared);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ sptep = pt + i;
+ gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as removed then another thread
+ * handling a page fault could overwrite it, so
+ * retry the exchange until the previous value of the
+ * SPTE is something other than the removed SPTE value.
+ */
+ for (;;) {
+ old_child_spte = xchg(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_child_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ old_child_spte = READ_ONCE(*sptep);
+
+ /*
+ * Marking the SPTE as a removed SPTE is not
+ * strictly necessary here as the MMU lock will
+ * stop other threads from concurrently modifying
+ * this SPTE. Using the removed SPTE value keeps
+ * the two branches consistent and simplifies
+ * the function.
+ */
+ WRITE_ONCE(*sptep, REMOVED_SPTE);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_child_spte, REMOVED_SPTE, level - 1,
+ shared);
+ }
+
+ kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ KVM_PAGES_PER_HPAGE(level));
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
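In the shared case above, each child SPTE is frozen with xchg() and the REMOVED_SPTE sentinel, retrying while another thread still holds the slot frozen. A stripped-down sketch of that freeze loop; MY_REMOVED and my_freeze_slot are illustrative, not the KVM constants:

#include <linux/atomic.h>
#include <asm/processor.h>

#define MY_REMOVED 0xdeadULL	/* non-present sentinel no writer will install */

/* Atomically take ownership of a slot, waiting out any other freezer. */
static u64 my_freeze_slot(u64 *slot)
{
	u64 old;

	for (;;) {
		old = xchg(slot, MY_REMOVED);
		if (old != MY_REMOVED)
			return old;	/* we now own the slot's old value */
		/* Another thread froze it first; spin until it installs
		 * some other value. */
		cpu_relax();
	}
}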
+/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
@@ -242,22 +368,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
* @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
*
* Handle bookkeeping that might result from the modification of a SPTE.
* This function must be called for all TDP SPTE modifications.
*/
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
- u64 *pt;
- struct kvm_mmu_page *sp;
- u64 old_child_spte;
- int i;
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
@@ -298,15 +424,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE, it is
- * unexpected. Log the change, though it should not impact the
- * guest since both the former and current SPTEs are nonpresent.
+ * If this change does not involve a MMIO SPTE or removed SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
*/
- if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
+ if (WARN_ON(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
- "are MMIO SPTEs.\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary removed SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -321,54 +451,127 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Recursively handle child PTs if the change removed a subtree from
* the paging structure.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present)) {
- pt = spte_to_child_pt(old_spte, level);
- sp = sptep_to_sp(pt);
+ if (was_present && !was_leaf && (pfn_changed || !is_present))
+ handle_removed_tdp_mmu_page(kvm,
+ spte_to_child_pt(old_spte, level), shared);
+}
- trace_kvm_mmu_prepare_zap_page(sp);
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+ shared);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+}
- list_del(&sp->link);
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ * this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ u64 *root_pt = tdp_iter_root_pt(iter);
+ struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+ int as_id = kvm_mmu_page_as_id(root);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ lockdep_assert_held_read(&kvm->mmu_lock);
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- old_child_spte = READ_ONCE(*(pt + i));
- WRITE_ONCE(*(pt + i), 0);
- handle_changed_spte(kvm, as_id,
- gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
- old_child_spte, 0, level - 1);
- }
+ /*
+ * Do not change removed SPTEs. Only the thread that froze the SPTE
+ * may modify it.
+ */
+ if (iter->old_spte == REMOVED_SPTE)
+ return false;
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+ new_spte) != iter->old_spte)
+ return false;
- free_page((unsigned long)pt);
- kmem_cache_free(mmu_page_header_cache, sp);
- }
+ handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+ iter->level, true);
+
+ return true;
}
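tdp_mmu_set_spte_atomic() leans on cmpxchg64() so that page-fault handlers holding the MMU lock only for read cannot clobber each other: the write lands only if the SPTE still contains the value the iterator last observed, and on failure KVM returns RET_PF_RETRY rather than spinning. A minimal sketch of the optimistic-update idiom itself, with illustrative names:

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/cmpxchg.h>

/* Succeeds only if *slot still holds the value we last observed. */
static bool my_try_update(u64 *slot, u64 observed, u64 new_val)
{
	return cmpxchg64(slot, observed, new_val) == observed;
}

/* A caller that prefers to loop instead of bailing out and retrying later. */
static void my_update_loop(u64 *slot, u64 new_val)
{
	u64 observed;

	do {
		observed = READ_ONCE(*slot);
	} while (!my_try_update(slot, observed, new_val));
}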
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ /*
+ * Freeze the SPTE by setting it to a special,
+ * non-present value. This will stop other threads from
+ * immediately installing a present entry in its place
+ * before the TLBs are flushed.
+ */
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ return false;
+
+ kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level));
+
+ /*
+ * No other thread can overwrite the removed SPTE as they
+ * must either wait on the MMU lock or use
+ * tdp_mmu_set_spte_atomic which will not overwrite the
+ * special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present
+ * to non-present.
+ */
+ WRITE_ONCE(*iter->sptep, 0);
+
+ return true;
}
+
+/*
+ * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * @record_acc_track: Notify the MM subsystem of changes to the accessed state
+ * of the page. Should be set unless handling an MMU
+ * notifier for access tracking. Leaving record_acc_track
+ * unset in that case prevents page accesses from being
+ * double counted.
+ * @record_dirty_log: Record the page as dirty in the dirty bitmap if
+ * appropriate for the change being made. Should be set
+ * unless performing certain dirty logging operations.
+ * Leaving record_dirty_log unset in that case prevents page
+ * writes from being double counted.
+ */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
- u64 *root_pt = tdp_iter_root_pt(iter);
+ tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
struct kvm_mmu_page *root = sptep_to_sp(root_pt);
int as_id = kvm_mmu_page_as_id(root);
- WRITE_ONCE(*iter->sptep, new_spte);
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to the
+ * temporary removed SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the removed SPTE should not be necessary.
+ */
+ WARN_ON(iter->old_spte == REMOVED_SPTE);
+
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level);
+ iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
@@ -413,27 +616,46 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
_mmu->shadow_root_level, _start, _end)
/*
- * Flush the TLB if the process should drop kvm->mmu_lock.
- * Return whether the caller still needs to flush the tlb.
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, it will also reset the tdp_iter's walk over the
+ * paging structure and the calling function should skip to the next
+ * iteration to allow the iterator to continue its traversal from the
+ * paging structure root.
+ *
+ * Return true if this function yielded and the iterator's traversal was reset.
+ * Return false if a yield was not needed.
*/
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter, bool flush)
{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
+ /* Ensure forward progress has been made before yielding. */
+ if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
- } else {
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ rcu_read_unlock();
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ rcu_read_lock();
+
+ WARN_ON(iter->gfn > iter->next_last_level_gfn);
+
+ tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+ iter->root_level, iter->min_level,
+ iter->next_last_level_gfn);
+
return true;
}
-}
-static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
-{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
- }
+ return false;
}
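The rewritten helper above yields only when the walk has advanced since the last yield, drops the RCU read lock around the reschedule point, and then restarts the iterator from the root. A boiled-down sketch of that shape; my_cond_resched and its arguments are illustrative:

#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustrative only: yield a write-held rwlock if someone is waiting. */
static bool my_cond_resched(rwlock_t *lock, unsigned long *last, unsigned long cur)
{
	/* Never yield twice without making progress in between. */
	if (cur == *last)
		return false;

	if (!need_resched() && !rwlock_needbreak(lock))
		return false;

	rcu_read_unlock();		/* cannot schedule inside RCU */
	cond_resched_rwlock_write(lock);
	rcu_read_lock();

	*last = cur;			/* record the progress point */
	return true;			/* caller must restart its walk */
}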
/*
@@ -453,7 +675,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
bool flush_needed = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+ flush_needed = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte))
continue;
@@ -468,12 +698,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
-
- if (can_yield)
- flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
- else
- flush_needed = true;
+ flush_needed = true;
}
+
+ rcu_read_unlock();
return flush_needed;
}
@@ -517,21 +745,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
int ret = 0;
int make_spte_ret = 0;
- if (unlikely(is_noslot_pfn(pfn))) {
+ if (unlikely(is_noslot_pfn(pfn)))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
- trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
- } else {
+ else
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
- }
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else
- tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
/*
* If the page fault was caused by a write but the page is write
@@ -545,10 +770,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte)))
+ if (unlikely(is_mmio_spte(new_spte))) {
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
ret = RET_PF_EMULATE;
+ } else
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
@@ -586,6 +817,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -601,49 +835,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
- KVM_PAGES_PER_HPAGE(iter.level));
+ if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ break;
/*
* The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*iter.sptep);
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
}
if (!is_shadow_present_pte(iter.old_spte)) {
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
- list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
child_pt = sp->spt;
- clear_page(child_pt);
+
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- trace_kvm_mmu_get_page(sp, true);
- if (huge_page_disallowed && req_level >= iter.level)
- account_huge_nx_page(vcpu->kvm, sp);
-
- tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+ new_spte)) {
+ tdp_mmu_link_page(vcpu->kvm, sp, true,
+ huge_page_disallowed &&
+ req_level >= iter.level);
+
+ trace_kvm_mmu_get_page(sp, true);
+ } else {
+ tdp_mmu_free_sp(sp);
+ break;
+ }
}
}
- if (WARN_ON(iter.level != level))
+ if (iter.level != level) {
+ rcu_read_unlock();
return RET_PF_RETRY;
+ }
ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
pfn, prefault);
+ rcu_read_unlock();
return ret;
}
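In the fault path above, the new page table is allocated up front and only linked in when the atomic install succeeds; if another vCPU wins the race, the page is freed with tdp_mmu_free_sp() and the fault retried. A small sketch of that allocate, try-install, undo-on-failure pattern with hypothetical names (my_table, my_install_table):

#include <linux/slab.h>
#include <linux/types.h>
#include <asm/cmpxchg.h>

struct my_table { u64 entries[512]; };

/* Returns true if our table was installed, false if another thread won. */
static bool my_install_table(u64 *parent_slot, u64 observed)
{
	struct my_table *pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	u64 new_val;

	if (!pt)
		return false;

	new_val = (u64)(unsigned long)pt;	/* illustrative encoding only */
	if (cmpxchg64(parent_slot, observed, new_val) != observed) {
		kfree(pt);	/* lost the race: undo and let the caller retry */
		return false;
	}

	return true;
}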
-static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end, unsigned long data,
- int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long data))
+static __always_inline int
+kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_mmu_page *root,
+ gfn_t start,
+ gfn_t end,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -705,6 +951,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
int young = 0;
u64 new_spte = 0;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* If we have a non-accessed entry we don't need to change the
@@ -736,6 +984,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}
+ rcu_read_unlock();
+
return young;
}
@@ -781,6 +1031,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
u64 new_spte;
int need_flush = 0;
+ rcu_read_lock();
+
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
@@ -809,6 +1061,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
if (need_flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ rcu_read_unlock();
+
return 0;
}
@@ -832,21 +1086,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (!is_shadow_present_pte(iter.old_spte) ||
- !is_last_spte(iter.old_spte, iter.level))
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
continue;
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -888,7 +1148,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -903,9 +1168,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -947,6 +1212,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
u64 new_spte;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
@@ -956,6 +1223,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
!(mask & (1UL << (iter.gfn - gfn))))
continue;
+ mask &= ~(1UL << (iter.gfn - gfn));
+
if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -969,9 +1238,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
}
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
-
- mask &= ~(1UL << (iter.gfn - gfn));
}
+
+ rcu_read_unlock();
}
/*
@@ -989,7 +1258,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1011,18 +1280,23 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
- if (!is_shadow_present_pte(iter.old_spte))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ iter.old_spte & shadow_dirty_mask)
continue;
new_spte = iter.old_spte | shadow_dirty_mask;
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+ rcu_read_unlock();
return spte_set;
}
@@ -1060,7 +1334,14 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
kvm_pfn_t pfn;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+ spte_set = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1072,9 +1353,10 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
tdp_mmu_set_spte(kvm, &iter, 0);
- spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
+ spte_set = true;
}
+ rcu_read_unlock();
if (spte_set)
kvm_flush_remote_tlbs(kvm);
}
@@ -1111,6 +1393,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
if (!is_writable_pte(iter.old_spte))
break;
@@ -1122,6 +1406,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
spte_set = true;
}
+ rcu_read_unlock();
+
return spte_set;
}
@@ -1137,7 +1423,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int root_as_id;
bool spte_set = false;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1162,10 +1448,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->shadow_root_level;
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
+ rcu_read_unlock();
+
return leaf;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index cbbdbadd1526..b4b65e3699b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,10 +5,6 @@
#include <linux/kvm_host.h>
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
@@ -47,4 +43,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+#ifdef CONFIG_X86_64
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!is_tdp_mmu_enabled(kvm))
+ return false;
+ if (WARN_ON(!VALID_PAGE(hpa)))
+ return false;
+
+ sp = to_shadow_page(hpa);
+ if (WARN_ON(!sp))
+ return false;
+
+ return is_tdp_mmu_page(sp) && sp->root_count;
+}
+
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index f472fdb6ae7e..a8502e02f479 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* variable MTRRs */
WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
- mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
/* MTRR base */
if (!valid_mtrr_type(data & 0xff))
@@ -351,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
- /* Extend the mask with all 1 bits to the left, since those
- * bits must implicitly be 0. The bits are then cleared
- * when reading them.
+ /*
+ * Set all illegal GPA bits in the mask, since those bits must
+ * implicitly be 0. The bits are then cleared when reading them.
*/
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+ cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
- *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+ *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
}
return 0;
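The three mtrr.c hunks above swap open-coded cpuid_maxphyaddr() arithmetic for the cached reserved-GPA mask. The underlying math is unchanged; a tiny worked sketch, where my_rsvd_bits mirrors KVM's rsvd_bits() helper:

#include <linux/types.h>

/* Bits [h:l] set, everything else clear; mirrors KVM's rsvd_bits() helper. */
static inline u64 my_rsvd_bits(int l, int h)
{
	return ((2ULL << (h - l)) - 1) << l;
}

/*
 * With MAXPHYADDR == 48 the reserved-GPA mask covers bits 63:48:
 *   my_rsvd_bits(48, 63) == 0xffff000000000000
 * "data | mask" forces the illegal bits to 1 in the stored MTRR mask, and
 * "*pdata & ~mask" clears them again when the guest reads the MSR back.
 */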
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 67741d2a0308..827886c12c16 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
return 1;
if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
- (kvm_x86_ops.get_cpl(vcpu) != 0) &&
+ (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
(kvm_read_cr0(vcpu) & X86_CR0_PE))
return 1;
@@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- if (lapic_in_kernel(vcpu))
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_x86_ops.pmu_ops->deliver_pmi)
+ kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
+ if (kvm_x86_ops.pmu_ops->cleanup)
+ kvm_x86_ops.pmu_ops->cleanup(vcpu);
+
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 067fef51760c..7b30bc967af3 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
void (*refresh)(struct kvm_vcpu *vcpu);
void (*init)(struct kvm_vcpu *vcpu);
void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
};
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0ef84d57b72e..78bdcfac4e40 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -298,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, source,
+ icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
{
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
@@ -324,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
- case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
* At this point, we expect that the AVIC HW has already
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
+ avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
break;
- }
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
index, svm->vcpu.vcpu_id, icrh, icrl);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index db30670dd8c4..cc91738ab445 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -58,7 +58,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -248,7 +248,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
if (vmcb12_lma) {
if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
!(vmcb12->save.cr0 & X86_CR0_PE) ||
- (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
return false;
}
if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
@@ -345,7 +345,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
@@ -392,7 +392,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
svm->vmcb->save.cpl = vmcb12->save.cpl;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..874ea309279f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -22,6 +22,7 @@
#include "x86.h"
#include "svm.h"
+#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"
@@ -1041,6 +1042,74 @@ e_unpin_memory:
return ret;
}
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_attestation_report *data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ /*
+ * If we only queried the report length, the FW responded with the
+ * required length in data->len.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
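The handler above follows the usual two-call SEV convention: userspace first issues the command with len == 0 to learn the blob size, then calls again with a buffer. A hedged sketch of the userspace side, with error handling trimmed; the structs and ioctl are the ones this series adds to the uapi, but treat the exact flow as illustrative:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Query-then-fetch an SEV attestation report for the VM behind vm_fd. */
static void *get_attestation_report(int vm_fd, const __u8 mnonce[16], __u32 *len)
{
	struct kvm_sev_attestation_report report = {};
	struct kvm_sev_cmd cmd = {
		.id = KVM_SEV_GET_ATTESTATION_REPORT,
		.data = (__u64)(unsigned long)&report,
	};
	void *blob;

	memcpy(report.mnonce, mnonce, sizeof(report.mnonce));

	/* First pass: len == 0 asks the firmware how big the report is. */
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

	blob = malloc(report.len);
	report.uaddr = (__u64)(unsigned long)blob;

	/* Second pass: the firmware writes the report into the buffer. */
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

	*len = report.len;
	return blob;
}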
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1091,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_LAUNCH_SECRET:
r = sev_launch_secret(kvm, &sev_cmd);
break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1994,29 +2066,17 @@ void sev_es_create_vcpu(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
struct vmcb_save_area *hostsa;
- unsigned int i;
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
* of which one step is to perform a VMLOAD. Since hardware does not
* perform a VMSAVE on VMRUN, the host savearea must be updated.
*/
- asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
-
- /*
- * Certain MSRs are restored on VMEXIT, only save ones that aren't
- * restored.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
+ vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
@@ -2029,22 +2089,6 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
hostsa->xss = host_xss;
}
-void sev_es_vcpu_put(struct vcpu_svm *svm)
-{
- unsigned int i;
-
- /*
- * Certain MSRs are restored on VMEXIT and were saved with vmsave in
- * sev_es_vcpu_load() above. Only restore ones that weren't.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
-}
-
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct vcpu_svm *svm = to_svm(vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3442d44ca53b..adb3619a3c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "svm.h"
+#include "svm_ops.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -200,9 +201,9 @@ module_param(sev_es, int, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
-static u8 rsm_ins_bytes[] = "\x0f\xaa";
+static bool svm_gp_erratum_intercept = true;
-static void svm_complete_interrupts(struct vcpu_svm *svm);
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
static unsigned long iopm_base;
@@ -246,21 +247,6 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline void clgi(void)
-{
- asm volatile (__ex("clgi"));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex("stgi"));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
-}
-
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -288,6 +274,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
/*
* Free the nested guest state, unless we are in SMM.
@@ -304,6 +293,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.efer = old_efer;
return ret;
}
+
+ if (svm_gp_erratum_intercept)
+ set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -925,6 +917,9 @@ static __init void svm_set_cpu_caps(void)
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
@@ -1032,6 +1027,9 @@ static __init int svm_hardware_setup(void)
}
}
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
@@ -1207,7 +1205,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, 2);
+ kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1366,6 +1364,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->vmsa = page_address(vmsa_page);
svm->asid_generation = 0;
+ svm->guest_state_loaded = false;
init_vmcb(svm);
svm_init_osvw(vcpu);
@@ -1413,30 +1412,31 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
}
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- int i;
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ unsigned int i;
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
+ if (svm->guest_state_loaded)
+ return;
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Save ones that aren't so we can restore them
+ * individually later.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_load(svm, cpu);
+ sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
+ vmsave(__sme_page_pa(sd->save_area));
}
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
@@ -1446,10 +1446,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
}
}
+
/* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned int i;
+
+ if (!svm->guest_state_loaded)
+ return;
+
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Restore the ones that weren't.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ svm->guest_state_loaded = false;
+}
+
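The split above replaces per-vcpu_load MSR save/restore with a lazy pair: host state is saved once on the first prepare_guest_switch after a host switch, and the guest_state_loaded flag makes both paths idempotent. A tiny sketch of that flag-guarded pattern, with illustrative names and the MSR accesses reduced to comments:

#include <linux/types.h>

struct my_vcpu {
	bool guest_state_loaded;
	u64 saved_host_msr;
};

static void my_prepare_guest_switch(struct my_vcpu *v)
{
	if (v->guest_state_loaded)
		return;			/* already saved since the last host switch */

	/* rdmsrl(MY_MSR, v->saved_host_msr);  -- save host state once */
	v->guest_state_loaded = true;
}

static void my_prepare_host_switch(struct my_vcpu *v)
{
	if (!v->guest_state_loaded)
		return;			/* nothing was saved, nothing to restore */

	/* wrmsrl(MY_MSR, v->saved_host_msr);  -- restore host state once */
	v->guest_state_loaded = false;
}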
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ }
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
@@ -1459,30 +1491,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
avic_vcpu_put(vcpu);
+ svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
- if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_put(svm);
- } else {
- kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
- load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
- }
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1815,7 +1827,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1865,7 +1877,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
get_debugreg(vcpu->arch.db[2], 2);
get_debugreg(vcpu->arch.db[3], 3);
/*
- * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
* because db_interception might need it. We can do it before vmentry.
*/
vcpu->arch.dr6 = svm->vmcb->save.dr6;
@@ -1916,7 +1928,7 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
- u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1962,24 +1974,6 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
-static int gp_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- u32 error_code = svm->vmcb->control.exit_info_1;
-
- WARN_ON_ONCE(!enable_vmware_backdoor);
-
- /*
- * VMware backdoor emulation on #GP interception only handles IN{S},
- * OUT{S}, and RDPMC, none of which generate a non-zero error code.
- */
- if (error_code) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
- }
- return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
-}
-
static bool is_erratum_383(void)
{
int err, i;
@@ -2178,6 +2172,102 @@ static int vmrun_interception(struct vcpu_svm *svm)
return nested_svm_vmrun(svm);
}
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ return nested_svm_vmexit(svm);
+ } else
+ return svm_instr_handlers[opcode](svm);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ * some AMD CPUs when the EAX used by these instructions points into reserved
+ * memory regions (e.g. SMM memory on the host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else
+ return emulate_svm_instr(vcpu, opcode);
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
if (value) {
@@ -2265,11 +2355,8 @@ static int xsetbv_interception(struct vcpu_svm *svm)
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
u32 index = kvm_rcx_read(&svm->vcpu);
- if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
-
- return 1;
+ int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int rdpru_interception(struct vcpu_svm *svm)
@@ -2530,6 +2617,7 @@ static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
unsigned long val;
+ int err = 0;
if (svm->vcpu.guest_debug == 0) {
/*
@@ -2547,20 +2635,16 @@ static int dr_interception(struct vcpu_svm *svm)
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
- if (dr >= 16) { /* mov to DRn */
- if (!kvm_require_dr(&svm->vcpu, dr - 16))
- return 1;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
- kvm_set_dr(&svm->vcpu, dr - 16, val);
+ err = kvm_set_dr(&svm->vcpu, dr, val);
} else {
- if (!kvm_require_dr(&svm->vcpu, dr))
- return 1;
kvm_get_dr(&svm->vcpu, dr, &val);
kvm_register_write(&svm->vcpu, reg, val);
}
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -3354,7 +3438,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3479,7 +3563,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !svm_interrupt_blocked(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3503,7 +3587,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
}
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3560,10 +3644,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
invlpga(gva, svm->vmcb->control.asid);
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3708,16 +3788,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (sev_es_guest(svm->vcpu.kvm)) {
__svm_sev_es_vcpu_run(svm->vmcb_pa);
} else {
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+
__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
-#ifdef CONFIG_X86_64
- native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
+ vmload(__sme_page_pa(sd->save_area));
}
/*
@@ -3783,7 +3858,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
- svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
+ svm_set_dr6(svm, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -3978,7 +4053,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
if (best)
- vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
if (!kvm_vcpu_apicv_active(vcpu))
@@ -4285,7 +4360,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return ret;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4439,7 +4514,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
@@ -4482,9 +4557,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
.set_nmi_mask = svm_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
@@ -4527,7 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.smi_allowed = svm_smi_allowed,
.pre_enter_smm = svm_pre_enter_smm,
.pre_leave_smm = svm_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = svm_enable_smi_window,
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6e7d070f8b86..39e071fdab0c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -23,22 +23,8 @@
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-static const struct svm_host_save_msrs {
- u32 index; /* Index of the MSR */
- bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */
-} host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- { .index = MSR_STAR, .sev_es_restored = true },
- { .index = MSR_LSTAR, .sev_es_restored = true },
- { .index = MSR_CSTAR, .sev_es_restored = true },
- { .index = MSR_SYSCALL_MASK, .sev_es_restored = true },
- { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true },
- { .index = MSR_FS_BASE, .sev_es_restored = true },
-#endif
- { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true },
- { .index = MSR_TSC_AUX, .sev_es_restored = false },
+static const u32 host_save_user_msrs[] = {
+ MSR_TSC_AUX,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
@@ -130,12 +116,6 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
u64 spec_ctrl;
/*
@@ -192,6 +172,8 @@ struct vcpu_svm {
u64 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ bool guest_state_loaded;
};
struct svm_cpu_data {
@@ -587,9 +569,8 @@ int sev_handle_vmgexit(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
-void sev_es_vcpu_put(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000000..8170f2a5a16f
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include <asm/kvm_host.h>
+
+#define svm_asm(insn, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+static inline void vmload(unsigned long pa)
+{
+ svm_asm1(vmload, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
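
For illustration only (not part of the patch): the svm_asm*() macros above wrap a single SVM instruction with an exception-table entry, so a faulting instruction jumps to the local 'fault' label and reports a spurious fault instead of oopsing. As a sketch in kernel context (not standalone), and assuming the usual asm_volatile_goto()/_ASM_EXTABLE definitions, vmload() roughly expands to:

    static inline void vmload(unsigned long pa)
    {
    	/* approximate result of svm_asm1(vmload, "a" (pa), "memory") */
    	asm_volatile_goto("1: vmload %0\n\t"
    			  _ASM_EXTABLE(1b, %l[fault])
    			  :: "a" (pa) : "memory" : fault);
    	return;
    fault:
    	kvm_spurious_fault();
    }
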
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2de30c20bc26..a61c015870e3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -93,6 +93,42 @@ TRACE_EVENT(kvm_hv_hypercall,
);
/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3, unsigned long a4,
+ unsigned long a5),
+ TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr)
+ __field(unsigned long, a0)
+ __field(unsigned long, a1)
+ __field(unsigned long, a2)
+ __field(unsigned long, a3)
+ __field(unsigned long, a4)
+ __field(unsigned long, a5)
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ __entry->a4 = a4;
+		__entry->a5 = a5;
+ ),
+
+	TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
+/*
* Tracepoint for PIO.
*/
@@ -256,7 +292,7 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
@@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
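
For illustration only (not part of the hunks shown here): TRACE_EVENT(kvm_xen_hypercall, ...) generates a trace_kvm_xen_hypercall() helper taking the seven arguments declared in TP_PROTO. A hypothetical call site in the Xen hypercall emulation path (the names 'input' and 'params[]' are assumed for the sketch, not taken from this excerpt) would look like:

    trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
    			    params[3], params[4], params[5]);
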
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 3a1861403d73..d1d77985e889 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -19,6 +19,9 @@ extern int __read_mostly pt_mode;
#define PT_MODE_HOST_GUEST 1
#define PMU_CAP_FW_WRITES (1ULL << 13)
+#define PMU_CAP_LBR_FMT 0x3f
+
+#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
struct nested_vmx_msrs {
/*
@@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
SECONDARY_EXEC_TSC_SCALING;
}
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
static inline bool cpu_has_vmx_apicv(void)
{
return cpu_has_vmx_apic_register_virt() &&
@@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void)
static inline u64 vmx_get_perf_capabilities(void)
{
+ u64 perf_cap = 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+ perf_cap &= PMU_CAP_LBR_FMT;
+
/*
* Since counters are virtualized, KVM would support full
* width counting unconditionally, even if the host lacks it.
*/
- return PMU_CAP_FW_WRITES;
+ return PMU_CAP_FW_WRITES | perf_cap;
+}
+
+static inline u64 vmx_supported_debugctl(void)
+{
+ u64 debugctl = 0;
+
+ if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+ debugctl |= DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
}
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f2b9bfb58206..b2f0b5e9cd63 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "nested.h"
#include "pmu.h"
#include "trace.h"
+#include "vmx.h"
#include "x86.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
if (nr == DB_VECTOR) {
if (!has_payload) {
payload = vcpu->arch.dr6;
- payload &= ~(DR6_FIXED_1 | DR6_BT);
- payload ^= DR6_RTM;
+ payload &= ~DR6_BT;
+ payload ^= DR6_ACTIVE_LOW;
}
*exit_qual = payload;
} else
@@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
(CC(!nested_cpu_has_vid(vmcs12)) ||
CC(!nested_exit_intr_ack_set(vcpu)) ||
CC((vmcs12->posted_intr_nv & 0xff00)) ||
- CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
- CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
+ CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
@@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
- int maxphyaddr;
-
if (count == 0)
return 0;
- maxphyaddr = cpuid_maxphyaddr(vcpu);
- if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
- (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
+
+ if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
return 0;
@@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
}
}
-static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
- unsigned long invalid_mask;
-
- invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
- return (val & invalid_mask) == 0;
-}
-
/*
* Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
* tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
@@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
enum vm_entry_failure_code *entry_failure_code)
{
- if (CC(!nested_cr3_valid(vcpu, cr3))) {
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -2532,7 +2522,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
*/
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
@@ -2635,7 +2625,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
/* Check for memory type validity */
switch (new_eptp & VMX_EPTP_MT_MASK) {
@@ -2666,7 +2655,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
}
/* Reserved bits should not be set */
- if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
@@ -2850,7 +2839,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
- CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
return -EINVAL;
if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
@@ -3057,35 +3046,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- asm(
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "je 1f \n\t"
- __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
- "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Check if vmlaunch or vmresume is needed */
- "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
-
- /*
- * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
- * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
- * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
- * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
- */
- "call vmx_vmenter\n\t"
-
- CC_SET(be)
- : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
- : [HOST_RSP]"r"((unsigned long)HOST_RSP),
- [loaded_vmcs]"r"(vmx->loaded_vmcs),
- [launched]"i"(offsetof(struct loaded_vmcs, launched)),
- [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
- [wordsize]"i"(sizeof(ulong))
- : "memory"
- );
+ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -3330,7 +3292,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
bool evaluate_pending_interrupts;
- u32 exit_reason, failed_index;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
@@ -3382,7 +3348,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (nested_vmx_check_guest_state(vcpu, vmcs12,
&entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit;
}
@@ -3393,7 +3359,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3403,7 +3369,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
if (failed_index) {
- exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
vmcs12->exit_qualification = failed_index;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3471,7 +3437,7 @@ vmentry_fail_vmexit:
return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->vm_exit_reason = exit_reason.full;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
@@ -5559,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
fail:
- nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu));
return 1;
@@ -5627,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
*/
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12, u32 exit_reason)
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
{
u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
@@ -5641,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
* First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap;
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
bitmap += 2048;
if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
@@ -5778,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
* Return true if L0 wants to handle an exit from L2 regardless of whether or not
* L1 wants the exit. Only call this when in is_guest_mode (L2).
*/
-static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5838,12 +5811,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
* Return 1 if L1 wants to intercept an exit from L2. Only call this when in
* is_guest_mode (L2).
*/
-static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5962,7 +5936,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
unsigned long exit_qual;
u32 exit_intr_info;
@@ -5981,7 +5955,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -6007,7 +5981,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
exit_qual = vmx_get_exit_qual(vcpu);
reflect_vmexit:
- nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
return true;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index cdf5f34518f4..d1df618cb7de 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- return false;
+ return 0;
- return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
}
static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * As a first step, a guest can only enable the LBR feature if its
+	 * CPU model is the same as the host's, because the LBR registers
+	 * would be passed through to the guest and they are model specific.
+ */
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+
+ return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+ bool ret = false;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return ret;
+
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= records->from && index < records->from + records->nr) ||
+ (index >= records->to && index < records->to + records->nr);
+
+ if (!ret && records->info)
+ ret = (index >= records->info && index < records->info + records->nr);
+
+ return ret;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_pmu_is_valid_lbr_msr(vcpu, msr);
break;
}
@@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->event) {
+ perf_event_release_kernel(lbr_desc->event);
+ lbr_desc->event = NULL;
+ vcpu_to_pmu(vcpu)->event_count--;
+ }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+	 * The perf_event_attr is constructed with the minimum set of fields required:
+ * - set 'pinned = true' to make it task pinned so that if another
+ * cpu pinned event reclaims LBR, the event->oncpu will be set to -1;
+ * - set '.exclude_host = true' to record guest branches behavior;
+ *
+	 * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicate that host perf
+	 *   should schedule the event with a fake HW counter rather than a real one;
+ * check is_guest_lbr_event() and __intel_get_event_constraints();
+ *
+ * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+ * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack
+ * event, which helps KVM to save/restore guest LBR records
+	 *   during host context switches and greatly reduces overhead,
+ * check branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (unlikely(lbr_desc->event)) {
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return -ENOENT;
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+}
+
+/*
+ * It's safe to access LBR MSRs from the guest when they have not
+ * been passed through, since the host will restore or reset the
+ * LBR MSR records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ u32 index = msr_info->index;
+
+ if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+ return false;
+
+ if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
+ goto dummy;
+
+ /*
+ * Disable irq to ensure the LBR feature doesn't get reclaimed by the
+	 * host at the time the value is read from the MSR, which prevents the
+	 * host LBR value from being leaked to the guest. If LBR has been reclaimed,
+ * return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrl(index, msr_info->data);
+ else
+ wrmsrl(index, msr_info->data);
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+ return true;
+ }
+ clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
return 0;
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
+ return 0;
}
return 1;
@@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
+ return 0;
}
return 1;
@@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
@@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
- vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -340,8 +489,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
@@ -385,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+
+ if (intel_pmu_lbr_is_compatible(vcpu))
+ x86_perf_get_lbr(&lbr_desc->records);
+ else
+ lbr_desc->records.nr = 0;
+
+ if (lbr_desc->records.nr)
+ bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
@@ -405,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
pmu->fixed_counters[i].current_config = 0;
}
+
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
+ lbr_desc->records.nr = 0;
+ lbr_desc->event = NULL;
+ lbr_desc->msr_passthrough = false;
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -429,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
pmu->global_ovf_ctrl = 0;
+
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and
+ * KVM emulates the freeze by clearing the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * The guest needs to re-enable LBR to resume recording branches.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+ u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ u8 version = vcpu_to_pmu(vcpu)->version;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return;
+
+ if (version > 1 && version < 4)
+ intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+ int i;
+
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+ }
+
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+ lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+ lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * pmu resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via ipi calls (more details in perf_install_in_context).
+ *
+ * Before entering non-root mode (with irqs disabled here), double
+ * check that the PMU features exposed to the guest have not been
+ * reclaimed by higher priority host events. Otherwise, disallow the
+ * vCPU's access to the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ goto warn;
+ } else
+ vmx_enable_lbr_msrs_passthrough(vcpu);
+
+ return;
+
+warn:
+ pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
+ vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
}
struct kvm_pmu_ops intel_pmu_ops = {
@@ -445,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = {
.refresh = intel_pmu_refresh,
.init = intel_pmu_init,
.reset = intel_pmu_reset,
+ .deliver_pmi = intel_pmu_deliver_pmi,
+ .cleanup = intel_pmu_cleanup,
};
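
For illustration only (not part of the patch): the freeze-LBRs-on-PMI emulation in intel_pmu_legacy_freezing_lbrs_on_pmi() boils down to clearing DEBUGCTL.LBR (bit 0) when DEBUGCTL.FREEZE_LBRS_ON_PMI (bit 11) is set. A standalone sketch of that bit manipulation:

    #include <stdint.h>
    #include <stdio.h>

    #define DEBUGCTLMSR_LBR			(1ULL << 0)
    #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1ULL << 11)

    /* Mirror of the guest IA32_DEBUGCTL update performed on PMI delivery. */
    static uint64_t freeze_lbrs_on_pmi(uint64_t debugctl)
    {
    	if (debugctl & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
    		debugctl &= ~DEBUGCTLMSR_LBR;	/* stop branch recording */
    	return debugctl;
    }

    int main(void)
    {
    	/* 0x801 = LBR | FREEZE_LBRS_ON_PMI -> prints 0x800 after the PMI. */
    	printf("%#llx\n", (unsigned long long)freeze_lbrs_on_pmi(0x801));
    	return 0;
    }
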
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index f02962dcc72c..4831bc44ce66 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
dest = cpu_physical_id(cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
dest = cpu_physical_id(vcpu->cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
*/
dest = cpu_physical_id(vcpu->pre_pcpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index e85aa5faa22d..3a6461694fc2 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -44,7 +44,7 @@
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
-SYM_FUNC_START(vmx_vmenter)
+SYM_FUNC_START_LOCAL(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index eb69fef57485..e0a3a9be654b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -50,6 +50,7 @@
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
+#include "hyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
@@ -552,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
- &vcpu->kvm->arch.hyperv.hv_pa_pg;
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
/*
	 * Synthetic VM-Exit is not enabled in current code and so all
	 * eVMCSes in a single VM share the same assist page.
@@ -658,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_IA32_RTIT_CR3_MATCH:
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ case MSR_LBR_SELECT:
+ case MSR_LBR_TOS:
+ case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
+ case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
+ case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
+ case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
+ case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+ /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
return true;
}
@@ -806,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
return *p;
}
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
@@ -1102,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
/* The base must be 128-byte aligned and a legal physical address. */
- return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
@@ -1577,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
@@ -1924,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_uret_msr;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ break;
default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_info->index);
@@ -1947,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
+static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 debugctl = vmx_supported_debugctl();
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -1997,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_DEBUGCTLMSR: {
+ u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ }
+
+ if (invalid)
+ return 1;
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
-
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -2196,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data >> 32) != 0)
return 1;
goto find_uret_msr;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data && !vcpu_to_pmu(vcpu)->version)
+ return 1;
+ if (data & PMU_CAP_LBR_FMT) {
+ if ((data & PMU_CAP_LBR_FMT) !=
+ (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ return 1;
+ if (!intel_pmu_lbr_is_compatible(vcpu))
+ return 1;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
default:
find_uret_msr:
@@ -2265,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
u64 msr;
cr4_set_bits(X86_CR4_VMXE);
- intel_pt_handle_vmx(1);
asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
@@ -2276,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
fault:
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
return -EFAULT;
@@ -2299,9 +2346,13 @@ static int hardware_enable(void)
!hv_get_vp_assist_page(cpu))
return -EFAULT;
+ intel_pt_handle_vmx(1);
+
r = kvm_cpu_vmxon(phys_addr);
- if (r)
+ if (r) {
+ intel_pt_handle_vmx(0);
return r;
+ }
if (enable_ept)
ept_sync_global();
@@ -2319,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
- asm volatile (__ex("vmxoff"));
-
- intel_pt_handle_vmx(0);
- cr4_clear_bits(X86_CR4_VMXE);
-}
-
static void hardware_disable(void)
{
vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ intel_pt_handle_vmx(0);
}
/*
@@ -2428,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC;
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
if (cpu_has_sgx())
opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
@@ -2739,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
@@ -2819,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
@@ -3774,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type, bool value)
{
if (value)
@@ -4269,6 +4313,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
ENABLE_USR_WAIT_PAUSE, false);
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
vmx->secondary_exec_control = exec_control;
}
@@ -4467,23 +4514,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
if (init_event)
vmx_clear_hlt(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
+ vmx_enable_irq_window(vcpu);
return;
}
@@ -4824,7 +4871,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
}
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
fallthrough;
case BP_VECTOR:
@@ -5049,6 +5096,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int dr, dr7, reg;
+ int err = 1;
exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
@@ -5057,9 +5105,9 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
- if (!kvm_require_cpl(vcpu, 0))
- return 1;
+ if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ goto out;
+
dr7 = vmcs_readl(GUEST_DR7);
if (dr7 & DR7_GD) {
/*
@@ -5068,7 +5116,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
@@ -5096,14 +5144,15 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (exit_qualification & TYPE_MOV_FROM_DR) {
unsigned long val;
- if (kvm_get_dr(vcpu, dr, &val))
- return 1;
+ kvm_get_dr(vcpu, dr, &val);
kvm_register_write(vcpu, reg, val);
- } else
- if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
- return 1;
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+ }
- return kvm_skip_emulated_instruction(vcpu);
+out:
+ return kvm_complete_insn_gp(vcpu, err);
}
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
@@ -5177,9 +5226,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
u64 new_bv = kvm_read_edx_eax(vcpu);
u32 index = kvm_rcx_read(vcpu);
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- return kvm_skip_emulated_instruction(vcpu);
- return 1;
+ int err = kvm_set_xcr(vcpu, index, new_bv);
+ return kvm_complete_insn_gp(vcpu, err);
}
static int handle_apic_access(struct kvm_vcpu *vcpu)
@@ -5600,6 +5648,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5656,6 +5711,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -5667,7 +5723,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
struct vcpu_vmx *vmx = to_vmx(vcpu);
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -5908,11 +5964,12 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -5954,11 +6011,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 1;
}
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ if (exit_reason.failed_vmentry) {
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
+ = exit_reason.full;
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -5980,18 +6037,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* will cause infinite loop.
*/
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_PML_FULL &&
- exit_reason != EXIT_REASON_APIC_ACCESS &&
- exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.ndata++;
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -6023,42 +6080,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- if (exit_reason >= kvm_vmx_max_exit_handlers)
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
- else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
- else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
return handle_interrupt_window(vcpu);
- else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
return handle_external_interrupt(vcpu);
- else if (exit_reason == EXIT_REASON_HLT)
+ else if (exit_reason.basic == EXIT_REASON_HLT)
return kvm_emulate_halt(vcpu);
- else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
return handle_ept_misconfig(vcpu);
#endif
- exit_reason = array_index_nospec(exit_reason,
- kvm_vmx_max_exit_handlers);
- if (!kvm_vmx_exit_handlers[exit_reason])
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
goto unexpected_vmexit;
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason.full);
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[0] = exit_reason.full;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+ /*
+	 * Even when the current exit reason is handled by KVM internally, we
+	 * still need to exit to user space when a bus lock is detected, to
+	 * inform user space that a bus lock occurred in the guest.
+ */
+ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+ }
+ return ret;
+}
+
/*
* Software based L1D cache flush which is used when microcode providing
* the cache control MSR is not loaded.
@@ -6129,7 +6206,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6373,9 +6450,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+ else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
}
@@ -6567,7 +6644,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- switch (to_vmx(vcpu)->exit_reason) {
+ switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
@@ -6577,8 +6654,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
}
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
@@ -6638,11 +6713,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
- fastpath_t exit_fastpath;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
-reenter_guest:
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6696,6 +6769,8 @@ reenter_guest:
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -6734,12 +6809,12 @@ reenter_guest:
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs))
+ if (static_branch_unlikely(&enable_evmcs)) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- if (static_branch_unlikely(&enable_evmcs))
- current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
@@ -6768,17 +6843,17 @@ reenter_guest:
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
- vmx->exit_reason = 0xdead;
+ vmx->exit_reason.full = 0xdead;
return EXIT_FASTPATH_NONE;
}
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -6790,22 +6865,7 @@ reenter_guest:
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
- if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
- if (!kvm_vcpu_exit_request(vcpu)) {
- /*
- * FIXME: this goto should be a loop in vcpu_enter_guest,
- * but it would incur the cost of a retpoline for now.
- * Revisit once static calls are available.
- */
- if (vcpu->arch.apicv_active)
- vmx_sync_pir_to_irr(vcpu);
- goto reenter_guest;
- }
- exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
- }
-
- return exit_fastpath;
+ return vmx_exit_handlers_fastpath(vcpu);
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -7256,7 +7316,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_cr4_guest_host_mask(vmx);
/* Refresh #PF interception to account for MAXPHYADDR changes. */
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
}
static __init void vmx_set_cpu_caps(void)
@@ -7546,7 +7606,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return 0;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
@@ -7606,7 +7666,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = vmx_update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
@@ -7649,9 +7709,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
.set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -7709,7 +7769,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.smi_allowed = vmx_smi_allowed,
.pre_enter_smm = vmx_pre_enter_smm,
.pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = vmx_enable_smi_window,
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -7810,6 +7870,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}
+ kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9d3a557949ac..12c53d05a902 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -70,6 +70,54 @@ struct pt_desc {
struct pt_ctx guest;
};
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
+#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
+
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -244,7 +292,7 @@ struct vcpu_vmx {
int vpid;
bool emulation_required;
- u32 exit_reason;
+ union vmx_exit_reason exit_reason;
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -279,6 +327,7 @@ struct vcpu_vmx {
u64 ept_pointer;
struct pt_desc pt_desc;
+ struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
@@ -329,7 +378,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
int root_level);
-void update_exception_bitmap(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
@@ -339,8 +388,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value);
static inline u8 vmx_get_rvi(void)
{
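
For illustration only (not part of the patch): a reduced, self-contained version of the vmx_exit_reason decode introduced above, assuming the little-endian, low-to-high bitfield allocation the kernel relies on (exit reason 33 is EXIT_REASON_INVALID_STATE):

    #include <stdint.h>
    #include <stdio.h>

    /* Collapsed variant of the union above: only the fields used here. */
    union vmx_exit_reason {
    	struct {
    		uint32_t basic          : 16;
    		uint32_t reserved       : 15;
    		uint32_t failed_vmentry : 1;
    	};
    	uint32_t full;
    };

    int main(void)
    {
    	union vmx_exit_reason r = { .full = 0x80000021u };

    	/* prints "basic=33 failed_vmentry=1" */
    	printf("basic=%u failed_vmentry=%u\n", r.basic, r.failed_vmentry);
    	return 0;
    }
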
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b967c1c774a1..884e5b3838c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -29,6 +29,7 @@
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
+#include "xen.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -114,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+#define KVM_X86_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
+ *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
+
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
-static bool __read_mostly report_ignored_msrs = true;
+bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
@@ -136,6 +147,8 @@ u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+bool __read_mostly kvm_has_bus_lock_exit;
+EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -234,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pte_updated", mmu_pte_updated),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
VM_STAT("mmu_flooded", mmu_flooded),
VM_STAT("mmu_recycled", mmu_recycled),
@@ -395,7 +407,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
- u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
@@ -484,19 +496,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
*/
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
/*
- * DR6.RTM is set by all #DB exceptions that don't clear it.
+ * To reflect the #DB exception payload in the guest's DR6, three
+ * classes of bits must be considered: the FIXED_1 bits, the
+ * active-low bits and the active-high bits (e.g. DR6_BD, DR6_BS
+ * and DR6_BT).  DR6_ACTIVE_LOW contains the FIXED_1 and active-low
+ * bits.  In the resulting guest DR6:
+ *  - FIXED_1 bits are always set.
+ *  - An active-low bit is cleared if it is set in the payload.
+ *  - An active-high bit is set if it is set in the payload.
+ *
+ * Note, the payload follows the pending-debug-exceptions/exit-
+ * qualification format used by VMX, in which the active-low bits
+ * are encoded active-high, so those bits must be flipped before
+ * being merged into DR6.
*/
- vcpu->arch.dr6 |= DR6_RTM;
+ vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
vcpu->arch.dr6 |= payload;
- /*
- * Bit 16 should be set in the payload whenever the #DB
- * exception should clear DR6.RTM. This makes the payload
- * compatible with the pending debug exceptions under VMX.
- * Though not currently documented in the SDM, this also
- * makes the payload compatible with the exit qualification
- * for #DB exceptions under VMX.
- */
- vcpu->arch.dr6 ^= payload & DR6_RTM;
+ vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
/*
* The #DB payload is defined as compatible with the 'pending
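Worked through on a concrete payload, the steps above look like the sketch below. The constants are restated locally from my reading of the x86 debug-register encoding, with DR6_ACTIVE_LOW taken as FIXED_1 plus the RTM bit; treat them as assumptions rather than values quoted from the tree.

#include <stdint.h>
#include <stdio.h>

/* Local restatement of the DR6 encoding used above (assumed values). */
#define DR_TRAP_BITS    0x0000000fULL          /* B0-B3 */
#define DR6_BS          (1ULL << 14)           /* single step */
#define DR6_RTM         (1ULL << 16)           /* active low in DR6 */
#define DR6_FIXED_1     0xfffe0ff0ULL
#define DR6_ACTIVE_LOW  (DR6_FIXED_1 | DR6_RTM)

int main(void)
{
	/* Payload as delivered by VMX: all bits active high, here "BS hit"
	 * plus "clear RTM". */
	uint64_t payload = DR6_BS | DR6_RTM;
	uint64_t dr6 = 0xffff4ff0ULL;              /* arbitrary starting value */

	dr6 &= ~DR_TRAP_BITS;                      /* drop stale B0-B3 */
	dr6 |= DR6_ACTIVE_LOW;                     /* FIXED_1 + active-low bits */
	dr6 |= payload;                            /* set active-high bits */
	dr6 ^= payload & DR6_ACTIVE_LOW;           /* flip active-low bits */

	/* Expected: BS set, RTM cleared, FIXED_1 preserved. */
	printf("dr6 = %#llx (BS=%d, RTM=%d)\n", (unsigned long long)dr6,
	       !!(dr6 & DR6_BS), !!(dr6 & DR6_RTM));
	return 0;
}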
@@ -692,7 +709,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
+ if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -742,8 +759,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
- return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
- rsvd_bits(1, 2);
+ return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}
/*
@@ -852,7 +868,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -865,7 +881,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -970,12 +986,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
- __kvm_set_xcr(vcpu, index, xcr)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
+ if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ return __kvm_set_xcr(vcpu, index, xcr);
+
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
@@ -987,7 +1001,7 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return false;
- return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
+ return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}
EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
@@ -1031,7 +1045,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- kvm_x86_ops.set_cr4(vcpu, cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1059,8 +1073,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
- if (is_long_mode(vcpu) &&
- (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1114,7 +1127,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- kvm_x86_ops.set_dr7(vcpu, dr7);
+ static_call(kvm_x86_set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1130,7 +1143,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
return fixed;
}
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1143,13 +1156,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 4:
case 6:
if (!kvm_dr6_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
break;
case 5:
default: /* 7 */
if (!kvm_dr7_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
break;
@@ -1157,18 +1170,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- if (__kvm_set_dr(vcpu, dr, val)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1185,7 +1189,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
*val = vcpu->arch.dr7;
break;
}
- return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1426,7 +1429,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- return kvm_x86_ops.get_msr_feature(msr);
+ return static_call(kvm_x86_get_msr_feature)(msr);
}
return 0;
}
@@ -1502,7 +1505,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = kvm_x86_ops.set_efer(vcpu, efer);
+ r = static_call(kvm_x86_set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1599,7 +1602,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return kvm_x86_ops.set_msr(vcpu, &msr);
+ return static_call(kvm_x86_set_msr)(vcpu, &msr);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1632,7 +1635,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = kvm_x86_ops.get_msr(vcpu, &msr);
+ ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -1673,12 +1676,12 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, err);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
}
static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static u64 kvm_msr_reason(int r)
@@ -1750,7 +1753,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -1776,17 +1779,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
else
trace_kvm_msr_write_ex(ecx, data);
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
/*
* The fast path for frequent and performance sensitive wrmsr emulation,
@@ -1936,15 +1938,14 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
struct pvclock_wall_clock wc;
+ u32 wc_sec_hi;
u64 wall_nsec;
- kvm->arch.wall_clock = wall_clock;
-
if (!wall_clock)
return;
@@ -1973,6 +1974,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
@@ -2209,7 +2216,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2592,13 +2599,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return ret;
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
+ struct gfn_to_hva_cache *cache,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
+ if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
+ &guest_hv_clock, offset, sizeof(guest_hv_clock))))
return;
/* This VCPU is paused, but it's legal for a guest to read another
@@ -2621,9 +2630,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
++guest_hv_clock.version; /* first time write, random junk */
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -2637,16 +2646,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
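The writer side above follows the pvclock version protocol: the version is bumped to an odd value before the fields are updated and back to an even value afterwards, with write barriers in between, so a guest can detect a torn read. A minimal guest-side reader sketch, assuming only that protocol; the struct and names below are simplified stand-ins, not the real pvclock_vcpu_time_info layout.

#include <stdint.h>

/* Simplified stand-in for the guest-visible time structure. */
struct demo_time_info {
	volatile uint32_t version;        /* odd while the host is mid-update */
	volatile uint64_t system_time;
	volatile uint64_t tsc_timestamp;
};

/* Retry while the version is odd (update in progress) or changes across
 * the read; the compiler barriers pair with the writer's smp_wmb(). */
static uint64_t demo_read_system_time(const struct demo_time_info *ti)
{
	uint32_t v1, v2;
	uint64_t t;

	do {
		v1 = ti->version;
		__sync_synchronize();
		t = ti->system_time;
		__sync_synchronize();
		v2 = ti->version;
	} while ((v1 & 1) || v1 != v2);

	return t;
}

int main(void)
{
	struct demo_time_info ti = { .version = 2, .system_time = 123456789ULL };

	return demo_read_system_time(&ti) == 123456789ULL ? 0 : 1;
}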
@@ -2733,7 +2742,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v);
+ kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
if (v == kvm_get_vcpu(v->kvm, 0))
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
@@ -2858,32 +2872,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- int lm = is_long_mode(vcpu);
- u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
- : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
- u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
- : kvm->arch.xen_hvm_config.blob_size_32;
- u32 page_num = data & ~PAGE_MASK;
- u64 page_addr = data & PAGE_MASK;
- u8 *page;
-
- if (page_num >= blob_size)
- return 1;
-
- page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
- kfree(page);
- return 1;
- }
- return 0;
-}
-
static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
{
u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
@@ -2955,13 +2943,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_all(vcpu);
+ static_call(kvm_x86_tlb_flush_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_guest(vcpu);
+ static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -3017,6 +3005,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
switch (msr) {
case MSR_AMD64_NB_CFG:
case MSR_IA32_UCODE_WRITE:
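With this change, a write to the MSR named in kvm->arch.xen_hvm_config is routed to kvm_xen_write_hypercall_page() instead of the old per-write xen_hvm_config() helper. A hedged userspace sketch of configuring that MSR follows; it assumes a linux/kvm.h that already carries the Xen flags added by this series, and the MSR index 0x40000000 is only an example value.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Assumes vm_fd is a VM file descriptor from KVM_CREATE_VM. */
static int demo_enable_xen_hypercall_msr(int vm_fd)
{
	struct kvm_xen_hvm_config cfg;

	memset(&cfg, 0, sizeof(cfg));
	cfg.msr = 0x40000000;                    /* example MSR index the guest will write */
	cfg.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;  /* flag added by this series */

	return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
}

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

	if (vm_fd < 0)
		return 1;
	return demo_enable_xen_hypercall_msr(vm_fd) ? 1 : 0;
}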
@@ -3073,18 +3064,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!data) {
- /* We support the non-activated case already */
- break;
- } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
- /* Values other than LBR and BTF are vendor-specific,
- thus reserved and should throw a #GP */
- return 1;
- } else if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
@@ -3153,13 +3132,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
@@ -3304,8 +3285,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_misc_features_enables = data;
break;
default:
- if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
- return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
return KVM_MSR_RET_INVALID;
@@ -3357,7 +3336,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
@@ -3739,7 +3717,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
- case KVM_CAP_XEN_HVM:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
@@ -3779,6 +3756,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
r = 1;
break;
+ case KVM_CAP_XEN_HVM:
+ r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ break;
case KVM_CAP_SYNC_REGS:
r = KVM_SYNC_X86_VALID_FIELDS;
break;
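KVM_CAP_XEN_HVM now reports a bitmask of supported Xen features rather than a plain boolean. A small probe, again assuming headers that define the new KVM_XEN_HVM_CONFIG_* flags:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int feat;

	if (kvm_fd < 0)
		return 1;

	/* The return value is now a feature mask, not just 0/1. */
	feat = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);
	printf("hypercall msr: %s\n",
	       (feat & KVM_XEN_HVM_CONFIG_HYPERCALL_MSR) ? "yes" : "no");
	printf("shared info  : %s\n",
	       (feat & KVM_XEN_HVM_CONFIG_SHARED_INFO) ? "yes" : "no");
	return 0;
}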
@@ -3800,10 +3782,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
+ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
- r = !kvm_x86_ops.cpu_has_accelerated_tpr();
+ r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
@@ -3845,6 +3827,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_STEAL_TIME:
r = sched_info_on();
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ if (kvm_has_bus_lock_exit)
+ r = KVM_BUS_LOCK_DETECTION_OFF |
+ KVM_BUS_LOCK_DETECTION_EXIT;
+ else
+ r = 0;
+ break;
default:
break;
}
@@ -3962,14 +3951,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops.has_wbinvd_exit())
+ if (static_call(kvm_x86_has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- kvm_x86_ops.vcpu_load(vcpu, cpu);
+ static_call(kvm_x86_vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -4015,6 +4004,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
+ int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4022,9 +4012,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- return;
+ goto out;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4032,33 +4028,18 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+
+out:
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- int idx;
-
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
- /*
- * Disable page faults because we're in atomic context here.
- * kvm_write_guest_offset_cached() would call might_fault()
- * that relies on pagefault_disable() to tell if there's a
- * bug. NOTE: the write to guest memory may not go through if
- * during postcopy live migration or if there's heavy guest
- * paging.
- */
- pagefault_disable();
- /*
- * kvm_memslots() will be called by
- * kvm_write_guest_offset_cached() so take the srcu lock.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- pagefault_enable();
- kvm_x86_ops.vcpu_put(vcpu);
+ static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -4072,7 +4053,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4182,7 +4163,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- kvm_x86_ops.setup_mce(vcpu);
+ static_call(kvm_x86_setup_mce)(vcpu);
out:
return r;
}
@@ -4289,11 +4270,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
+ events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -4360,13 +4341,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops.set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -4422,9 +4403,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- if (dbgregs->dr6 & ~0xffffffffull)
+ if (!kvm_dr6_valid(dbgregs->dr6))
return -EINVAL;
- if (dbgregs->dr7 & ~0xffffffffull)
+ if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
@@ -4661,7 +4642,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_direct_tlbflush)
return -ENOTTY;
- return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+ return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
@@ -5032,6 +5013,26 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
+ case KVM_XEN_VCPU_GET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+ if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_VCPU_SET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -5053,14 +5054,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = kvm_x86_ops.set_tss_addr(kvm, addr);
+ ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
+ return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -5217,8 +5218,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
- if (kvm_x86_ops.flush_log_dirty)
- kvm_x86_ops.flush_log_dirty(kvm);
+ static_call_cond(kvm_x86_flush_log_dirty)(kvm);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5308,6 +5308,20 @@ split_irqchip_unlock:
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
+ break;
+
+ if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+ (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+ break;
+
+ if (kvm_has_bus_lock_exit &&
+ cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+ kvm->arch.bus_lock_detection_enabled = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
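The new KVM_CAP_X86_BUS_LOCK_EXIT handling accepts either KVM_BUS_LOCK_DETECTION_OFF or KVM_BUS_LOCK_DETECTION_EXIT, but not both. A sketch of how userspace might opt in, under the assumption that both the host kernel and linux/kvm.h carry this capability:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	struct kvm_enable_cap cap;

	if (vm_fd < 0)
		return 1;

	/* Only request exits if the host actually advertises them. */
	if (!(ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_BUS_LOCK_EXIT) &
	      KVM_BUS_LOCK_DETECTION_EXIT))
		return 1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_BUS_LOCK_EXIT;
	cap.args[0] = KVM_BUS_LOCK_DETECTION_EXIT;  /* OFF and EXIT are mutually exclusive */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap) ? 1 : 0;
}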
@@ -5637,11 +5651,27 @@ set_pit2_out:
r = -EFAULT;
if (copy_from_user(&xhc, argp, sizeof(xhc)))
goto out;
- r = -EINVAL;
- if (xhc.flags)
+ r = kvm_xen_hvm_config(kvm, &xhc);
+ break;
+ }
+ case KVM_XEN_HVM_GET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
goto out;
- memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
- r = 0;
+ r = kvm_xen_hvm_get_attr(kvm, &xha);
+ if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_HVM_SET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
case KVM_SET_CLOCK: {
@@ -5686,7 +5716,7 @@ set_pit2_out:
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
- r = kvm_x86_ops.mem_enc_op(kvm, argp);
+ r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5698,7 +5728,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_reg_region)
- r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5710,7 +5740,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_unreg_region)
- r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -5812,7 +5842,7 @@ static void kvm_init_msr_list(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
+ if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -5875,13 +5905,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.set_segment(vcpu, var, seg);
+ static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.get_segment(vcpu, var, seg);
+ static_call(kvm_x86_get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -5901,14 +5931,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5916,7 +5946,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5965,7 +5995,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -5990,7 +6020,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -6011,7 +6041,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6064,7 +6094,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6089,7 +6119,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -6123,7 +6153,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -6531,7 +6561,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops.get_segment_base(vcpu, seg);
+ return static_call(kvm_x86_get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -6544,7 +6574,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (kvm_x86_ops.has_wbinvd_exit()) {
+ if (static_call(kvm_x86_has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -6571,17 +6601,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
{
- return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value)
{
- return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+ return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -6649,27 +6679,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
+ return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -6811,7 +6841,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+ return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
&ctxt->exception);
}
@@ -6849,7 +6879,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
+ static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -6865,7 +6895,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
@@ -6927,7 +6957,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -6938,7 +6968,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -6980,7 +7010,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -7041,7 +7071,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -7101,9 +7131,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
if (indirect_shadow_pages)
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
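mmu_lock is taken with write_lock()/write_unlock() here because it has become an rwlock: as I read this series, TDP MMU page faults can take it for read concurrently, while paths like this one still need exclusive access. A loose userspace analogue of that reader/writer split, using POSIX rwlocks rather than the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t demo_mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long demo_indirect_shadow_pages;

/* Concurrent readers (page-fault-like paths) may hold the lock together. */
static unsigned long demo_read_counter(void)
{
	unsigned long val;

	pthread_rwlock_rdlock(&demo_mmu_lock);
	val = demo_indirect_shadow_pages;
	pthread_rwlock_unlock(&demo_mmu_lock);
	return val;
}

/* Writers (zapping/unprotecting pages) still get exclusive access. */
static void demo_bump_counter(void)
{
	pthread_rwlock_wrlock(&demo_mmu_lock);
	demo_indirect_shadow_pages++;
	pthread_rwlock_unlock(&demo_mmu_lock);
}

int main(void)
{
	demo_bump_counter();
	printf("indirect_shadow_pages=%lu\n", demo_read_counter());
	return 0;
}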
@@ -7210,7 +7240,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7222,10 +7252,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
int r;
- r = kvm_x86_ops.skip_emulated_instruction(vcpu);
+ r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -7254,7 +7284,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7311,6 +7341,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+/*
+ * Decode the instruction to be emulated.  Returns EMULATION_OK on success.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ int r = EMULATION_OK;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since we do not set
+ * complete_userspace_io. This does not handle watchpoints yet,
+ * those would be handled in the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
{
@@ -7319,7 +7385,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -7330,32 +7396,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*/
write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
- kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
-
- /*
- * We will reenter on the same instruction since
- * we do not set complete_userspace_io. This does not
- * handle watchpoints yet, those would be handled in
- * the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
+ kvm_clear_exception_queue(vcpu);
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
if (r != EMULATION_OK) {
if ((emulation_type & EMULTYPE_TRAP_UD) ||
(emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
@@ -7462,7 +7508,7 @@ restart:
r = 1;
if (writeback) {
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
@@ -7471,7 +7517,7 @@ restart:
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops.update_emulated_instruction)
- kvm_x86_ops.update_emulated_instruction(vcpu);
+ static_call(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7800,7 +7846,7 @@ static int kvm_is_user_mode(void)
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
+ user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -7945,7 +7991,6 @@ int kvm_arch_init(void *opaque)
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
- kvm_lapic_init();
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
@@ -7987,6 +8032,7 @@ void kvm_arch_exit(void)
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_fpu_cache);
+ WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
}
static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
@@ -8038,7 +8084,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
- if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
clock_pairing.sec = ts.tv_sec;
@@ -8114,7 +8160,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
- if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
nr = kvm_rax_read(vcpu);
@@ -8134,7 +8183,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (kvm_x86_ops.get_cpl(vcpu) != 0) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
@@ -8191,7 +8240,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- kvm_x86_ops.patch_hypercall(vcpu, instruction);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -8215,12 +8264,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->if_flag = !vcpu->arch.guest_state_protected
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
+
+ if (is_smm(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_SMM;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -8246,7 +8297,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
+ static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
@@ -8257,7 +8308,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
/*
@@ -8276,10 +8327,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- kvm_x86_ops.set_irq(vcpu);
+ static_call(kvm_x86_set_irq)(vcpu);
can_inject = false;
}
}
@@ -8320,7 +8371,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
@@ -8336,7 +8387,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
if (vcpu->arch.smi_pending) {
- r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
@@ -8345,35 +8396,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
enter_smm(vcpu);
can_inject = false;
} else
- kvm_x86_ops.enable_smi_window(vcpu);
+ static_call(kvm_x86_enable_smi_window)(vcpu);
}
if (vcpu->arch.nmi_pending) {
- r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
- WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+ WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- kvm_x86_ops.enable_nmi_window(vcpu);
+ static_call(kvm_x86_enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- kvm_x86_ops.set_irq(vcpu);
- WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+ static_call(kvm_x86_set_irq)(vcpu);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
@@ -8398,7 +8449,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
- if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -8488,11 +8539,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
@@ -8542,7 +8593,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
@@ -8552,7 +8603,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
@@ -8582,30 +8633,30 @@ static void enter_smm(struct kvm_vcpu *vcpu)
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
- kvm_x86_ops.pre_enter_smm(vcpu, buf);
+ static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
- if (kvm_x86_ops.get_nmi_mask(vcpu))
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- kvm_x86_ops.set_nmi_mask(vcpu, true);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_x86_ops.set_cr4(vcpu, 0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
- __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
cs.base = vcpu->arch.smbase;
@@ -8634,7 +8685,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- kvm_x86_ops.set_efer(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
#endif
kvm_update_cpuid_runtime(vcpu);
@@ -8672,7 +8723,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
kvm_apic_update_apicv(vcpu);
- kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+ static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -8689,7 +8740,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
unsigned long old, new, expected;
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+ !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
@@ -8709,7 +8760,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
trace_kvm_apicv_update_request(activate, bit);
if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+ static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
/*
* Sending request to update APICV for all other vcpus,
@@ -8735,7 +8786,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -8753,9 +8804,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
- vcpu_to_synic(vcpu)->vec_bitmap, 256);
- kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ if (to_hv_vcpu(vcpu))
+ bitmap_or((ulong *)eoi_exit_bitmap,
+ vcpu->arch.ioapic_handled_vectors,
+ to_hv_synic(vcpu)->vec_bitmap, 256);
+
+ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8780,7 +8834,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- kvm_x86_ops.set_apic_access_page_addr(vcpu);
+ static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8905,8 +8959,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ vcpu->run->hyperv = hv_vcpu->exit;
r = 0;
goto out;
}
@@ -8923,10 +8979,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_ops.msr_filter_changed(vcpu);
+ static_call(kvm_x86_msr_filter_changed)(vcpu);
}
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+ kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -8936,7 +8993,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
inject_pending_event(vcpu, &req_immediate_exit);
if (req_int_win)
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -8951,7 +9008,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- kvm_x86_ops.prepare_guest_switch(vcpu);
+ static_call(kvm_x86_prepare_guest_switch)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -8982,7 +9039,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* notified with kvm_vcpu_kick.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -8996,7 +9053,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.request_immediate_exit(vcpu);
+ static_call(kvm_x86_request_immediate_exit)(vcpu);
}
fpregs_assert_state_consistent();
@@ -9013,7 +9070,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- exit_fastpath = kvm_x86_ops.run(vcpu);
+ for (;;) {
+ exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ break;
+ }
+
+ if (vcpu->arch.apicv_active)
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9023,7 +9092,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- kvm_x86_ops.sync_dirty_debug_regs(vcpu);
+ static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
@@ -9045,7 +9114,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops.handle_exit_irqoff(vcpu);
+ static_call(kvm_x86_handle_exit_irqoff)(vcpu);
/*
* Consume any pending interrupts, including the possible source of
@@ -9087,13 +9156,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
+ r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.cancel_injection(vcpu);
+ static_call(kvm_x86_cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -9103,13 +9172,13 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
+ (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block)
- kvm_x86_ops.post_block(vcpu);
+ static_call(kvm_x86_post_block)(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -9330,6 +9399,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
+ kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -9504,10 +9574,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -9625,7 +9695,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
*/
if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
return false;
- if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+ if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
return false;
} else {
/*
@@ -9660,10 +9730,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- kvm_x86_ops.set_gdt(vcpu, &dt);
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9673,14 +9743,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops.set_efer(vcpu, sregs->efer);
+ static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9788,7 +9858,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops.update_exception_bitmap(vcpu);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
r = 0;
@@ -9966,7 +10036,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm))
vcpu->arch.apicv_active = true;
} else
- static_key_slow_inc(&kvm_no_apic_vcpu);
+ static_branch_inc(&kvm_has_noapic_vcpu);
r = -ENOMEM;
@@ -10004,7 +10074,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -10014,9 +10084,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
- kvm_hv_vcpu_init(vcpu);
-
- r = kvm_x86_ops.vcpu_create(vcpu);
+ r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
@@ -10052,8 +10120,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- kvm_hv_vcpu_postcreate(vcpu);
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -10079,7 +10145,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- kvm_x86_ops.vcpu_free(vcpu);
+ static_call(kvm_x86_vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -10096,7 +10162,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
+ static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -10115,7 +10181,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
- vcpu->arch.dr6 = DR6_INIT;
+ vcpu->arch.dr6 = DR6_ACTIVE_LOW;
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -10168,7 +10234,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
- kvm_x86_ops.vcpu_reset(vcpu, init_event);
+ static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10194,7 +10260,7 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
- ret = kvm_x86_ops.hardware_enable();
+ ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -10276,7 +10342,7 @@ int kvm_arch_hardware_enable(void)
void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops.hardware_disable();
+ static_call(kvm_x86_hardware_disable)();
drop_user_return_notifiers();
}
@@ -10295,6 +10361,7 @@ int kvm_arch_hardware_setup(void *opaque)
return r;
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+ kvm_ops_static_call_update();
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -10323,7 +10390,7 @@ int kvm_arch_hardware_setup(void *opaque)
void kvm_arch_hardware_unsetup(void)
{
- kvm_x86_ops.hardware_unsetup();
+ static_call(kvm_x86_hardware_unsetup)();
}
int kvm_arch_check_processor_compat(void *opaque)
@@ -10351,8 +10418,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-struct static_key kvm_no_apic_vcpu __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
@@ -10363,12 +10430,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
pmu->need_cleanup = true;
kvm_make_request(KVM_REQ_PMU, vcpu);
}
- kvm_x86_ops.sched_in(vcpu, cpu);
+ static_call(kvm_x86_sched_in)(vcpu, cpu);
}
void kvm_arch_free_vm(struct kvm *kvm)
{
- kfree(kvm->arch.hyperv.hv_pa_pg);
+ kfree(to_kvm_hv(kvm)->hv_pa_pg);
vfree(kvm);
}
@@ -10407,7 +10474,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- return kvm_x86_ops.vm_init(kvm);
+ return static_call(kvm_x86_vm_init)(kvm);
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -10552,8 +10619,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
- if (kvm_x86_ops.vm_destroy)
- kvm_x86_ops.vm_destroy(kvm);
+ static_call_cond(kvm_x86_vm_destroy)(kvm);
for (i = 0; i < kvm->arch.msr_filter.count; i++)
kfree(kvm->arch.msr_filter.ranges[i].bitmap);
kvm_pic_destroy(kvm);
@@ -10563,6 +10629,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
+ kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
}
@@ -10744,7 +10811,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*/
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (kvm_x86_ops.slot_enable_log_dirty) {
- kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+ static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
} else {
int level =
kvm_dirty_log_manual_protect_and_init_set(kvm) ?
@@ -10761,8 +10828,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, new, level);
}
} else {
- if (kvm_x86_ops.slot_disable_log_dirty)
- kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+ static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
}
}
@@ -10801,7 +10867,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
kvm_x86_ops.guest_apic_has_interrupt &&
- kvm_x86_ops.guest_apic_has_interrupt(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10820,12 +10886,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- kvm_x86_ops.nmi_allowed(vcpu, false)))
+ static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
- kvm_x86_ops.smi_allowed(vcpu, false)))
+ static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10859,7 +10925,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
return true;
return false;
@@ -10877,7 +10943,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.interrupt_allowed(vcpu, false);
+ return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10903,7 +10969,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = kvm_x86_ops.get_rflags(vcpu);
+ rflags = static_call(kvm_x86_get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -10915,7 +10981,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- kvm_x86_ops.set_rflags(vcpu, rflags);
+ static_call(kvm_x86_set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -11045,7 +11111,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
+ (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
return false;
return true;
@@ -11190,7 +11256,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -11215,7 +11281,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* in this case) doesn't want to receive the interrupts.
*/
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -11226,7 +11292,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
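
Most of the x86.c hunks above are a mechanical conversion from indirect kvm_x86_ops.foo() calls to static_call(kvm_x86_foo)(); the per-member trampolines and kvm_ops_static_call_update() are generated by macros that are not part of this excerpt. A minimal stand-alone sketch of the same pattern, using invented my_vendor_ops names rather than the real KVM macros:

#include <linux/static_call.h>

struct my_vendor_ops {
	int (*hardware_enable)(void);
	void (*hardware_disable)(void);
};

/* One NULL-initialised trampoline per callback, patched before first use. */
DEFINE_STATIC_CALL_NULL(my_hardware_enable,
			*(((struct my_vendor_ops *)0)->hardware_enable));
DEFINE_STATIC_CALL_NULL(my_hardware_disable,
			*(((struct my_vendor_ops *)0)->hardware_disable));

/* Counterpart of kvm_ops_static_call_update(): run once at hardware setup. */
static void my_static_call_update(const struct my_vendor_ops *ops)
{
	static_call_update(my_hardware_enable, ops->hardware_enable);
	static_call_update(my_hardware_disable, ops->hardware_disable);
}

static int my_enable(void)
{
	/* Compiles down to a direct call once the trampoline is patched. */
	return static_call(my_hardware_enable)();
}
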
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 0f727b50bd3d..39eb04887141 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -98,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -129,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
}
static inline int is_pae(struct kvm_vcpu *vcpu)
@@ -244,9 +244,10 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
{
- return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
+ return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -273,6 +274,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
@@ -296,6 +299,8 @@ extern int pi_inject_timer;
extern struct static_key kvm_no_apic_vcpu;
+extern bool report_ignored_msrs;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -391,7 +396,6 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 000000000000..af8f6562fce4
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+
+#include "trace.h"
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ int wc_ofs, sec_hi_ofs;
+ int ret;
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
+ gpa, PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ kvm->arch.xen.shinfo_set = true;
+
+ /* Paranoia checks on the 32-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ /* 32-bit location by default */
+ wc_ofs = offsetof(struct compat_shared_info, wc);
+ sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
+
+#ifdef CONFIG_X86_64
+ /* Paranoia checks on the 64-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+ BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+ if (kvm->arch.xen.long_mode) {
+ wc_ofs = offsetof(struct shared_info, wc);
+ sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
+ }
+#endif
+
+ kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ u8 rc = 0;
+
+ /*
+ * If the global upcall vector (HVMIRQ_callback_vector) is set and
+ * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+ */
+ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
+
+ /* No need for compat handling here */
+ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+ offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+
+ /*
+ * For efficiency, this mirrors the checks for using the valid
+ * cache in kvm_read_guest_offset_cached(), but just uses
+ * __get_user() instead. And falls back to the slow path.
+ */
+ if (likely(slots->generation == ghc->generation &&
+ !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+ /* Fast path */
+ __get_user(rc, (u8 __user *)ghc->hva + offset);
+ } else {
+ /* Slow path */
+ kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+ sizeof(rc));
+ }
+
+ return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+ r = -EINVAL;
+ } else {
+ kvm->arch.xen.long_mode = !!data->u.long_mode;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (data->u.shared_info.gfn == GPA_INVALID) {
+ kvm->arch.xen.shinfo_set = false;
+ r = 0;
+ break;
+ }
+ r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ kvm->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ data->u.long_mode = kvm->arch.xen.long_mode;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (kvm->arch.xen.shinfo_set)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = kvm->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int idx, r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ /* No compat necessary here. */
+ BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+ sizeof(struct compat_vcpu_info));
+
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_info_set = false;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa,
+ sizeof(struct vcpu_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_time_info_set = false;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_time_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ if (vcpu->arch.xen.vcpu_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (vcpu->arch.xen.vcpu_time_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ bool lm = is_long_mode(vcpu);
+
+ /* Latch long_mode for shared_info pages etc. */
+ vcpu->kvm->arch.xen.long_mode = lm;
+
+ /*
+ * If Xen hypercall intercept is enabled, fill the hypercall
+ * page with VMCALL/VMMCALL instructions since that's what
+ * we catch. Else the VMM has provided the hypercall pages
+ * with instructions of its own choosing, so use those.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ u8 instructions[32];
+ int i;
+
+ if (page_num)
+ return 1;
+
+ /* mov imm32, %eax */
+ instructions[0] = 0xb8;
+
+ /* vmcall / vmmcall */
+ kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+
+ /* ret */
+ instructions[8] = 0xc3;
+
+ /* int3 to pad */
+ memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+ page_addr + (i * sizeof(instructions)),
+ instructions, sizeof(instructions)))
+ return 1;
+ }
+ } else {
+ /*
+ * Note, truncation is a non-issue as 'lm' is guaranteed to be
+ * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+ */
+ hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
+ : kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u8 *page;
+
+ if (page_num >= blob_size)
+ return 1;
+
+ blob_addr += page_num * PAGE_SIZE;
+
+ page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+ if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ return -EINVAL;
+
+ /*
+ * With hypercall interception the kernel generates its own
+ * hypercall page so it must not be provided.
+ */
+ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+ (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+ xhc->blob_size_32 || xhc->blob_size_64))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+ static_branch_inc(&kvm_xen_enabled.key);
+ else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+ memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_rax_write(vcpu, result);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+ return 1;
+
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+ bool longmode;
+ u64 input, params[6];
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+ /* Hyper-V hypercalls get bit 31 set in EAX */
+ if ((input & 0x80000000) &&
+ kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ longmode = is_64_bit_mode(vcpu);
+ if (!longmode) {
+ params[0] = (u32)kvm_rbx_read(vcpu);
+ params[1] = (u32)kvm_rcx_read(vcpu);
+ params[2] = (u32)kvm_rdx_read(vcpu);
+ params[3] = (u32)kvm_rsi_read(vcpu);
+ params[4] = (u32)kvm_rdi_read(vcpu);
+ params[5] = (u32)kvm_rbp_read(vcpu);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ params[0] = (u64)kvm_rdi_read(vcpu);
+ params[1] = (u64)kvm_rsi_read(vcpu);
+ params[2] = (u64)kvm_rdx_read(vcpu);
+ params[3] = (u64)kvm_r10_read(vcpu);
+ params[4] = (u64)kvm_r8_read(vcpu);
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+#endif
+ trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+ vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
+ vcpu->run->xen.u.hcall.params[2] = params[2];
+ vcpu->run->xen.u.hcall.params[3] = params[3];
+ vcpu->run->xen.u.hcall.params[4] = params[4];
+ vcpu->run->xen.u.hcall.params[5] = params[5];
+ vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io =
+ kvm_xen_hypercall_complete_userspace;
+
+ return 0;
+}
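
kvm_xen_hypercall() above defers the actual hypercall to the VMM and arranges for kvm_xen_hypercall_complete_userspace() to pick up whatever the VMM wrote into run->xen.u.hcall.result on the next KVM_RUN. A hedged userspace sketch of that handshake; handle_xen_hypercall() and the surrounding vCPU-loop plumbing are illustrative, not part of this patch:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <errno.h>

/* Hypothetical VMM policy hook: emulate or reject a guest hypercall. */
static __u64 handle_xen_hypercall(__u64 input, const __u64 params[6])
{
	return (__u64)-ENOSYS;		/* deny everything in this sketch */
}

static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -errno;

	if (run->exit_reason == KVM_EXIT_XEN &&
	    run->xen.type == KVM_EXIT_XEN_HCALL) {
		/*
		 * The next KVM_RUN re-enters through the completion callback,
		 * which copies this result into the guest's RAX and skips the
		 * hypercall instruction.
		 */
		run->xen.u.hcall.result =
			handle_xen_hypercall(run->xen.u.hcall.input,
					     run->xen.u.hcall.params);
	}
	return 0;
}
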
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644
index 000000000000..b66a921776f4
--- /dev/null
+++ b/arch/x86/kvm/xen.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ (kvm->arch.xen_hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ return __kvm_xen_has_interrupt(vcpu);
+
+ return 0;
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ struct pvclock_wall_clock wc;
+ struct compat_arch_shared_info arch;
+};
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */
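
One more note on the hypercall page that kvm_xen_write_hypercall_page() fills when interception is enabled: each hypercall number owns a fixed 32-byte stub, so the guest-visible entry point is simply an offset into the page. The byte layout shown is the VMX case; on SVM the vmcall bytes are patched to vmmcall (0f 01 d9). This is an illustration, not code from the patch:

/*
 * Offset  Bytes           Instruction
 *   0     b8 <nr:imm32>   mov  $nr, %eax
 *   5     0f 01 c1        vmcall
 *   8     c3              ret
 *   9-31  cc ...          int3 padding
 */
static inline void *xen_hypercall_entry(void *hypercall_page, unsigned int nr)
{
	/* 32 == sizeof(instructions) in kvm_xen_write_hypercall_page(). */
	return (unsigned char *)hypercall_page + nr * 32;
}
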
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index 584b0de6f2ca..41c449ece2d8 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -12,8 +12,8 @@
#define _XTENSA_SPINLOCK_H
#include <asm/barrier.h>
-#include <asm/qrwlock.h>
#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
#define smp_mb__after_spinlock() smp_mb()
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 476113e12489..cb9b4c4e371e 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -128,6 +128,7 @@ static int sev_cmd_buffer_len(int cmd)
case SEV_CMD_LAUNCH_UPDATE_SECRET: return sizeof(struct sev_data_launch_secret);
case SEV_CMD_DOWNLOAD_FIRMWARE: return sizeof(struct sev_data_download_firmware);
case SEV_CMD_GET_ID: return sizeof(struct sev_data_get_id);
+ case SEV_CMD_ATTESTATION_REPORT: return sizeof(struct sev_data_attestation_report);
default: return 0;
}
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 60f1a386dd06..b4348256ae95 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
return -EINVAL;
}
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvmgt_gfn_is_write_protected(info, gfn))
goto out;
@@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
kvmgt_protect_table_add(info, gfn);
out:
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
@@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
return -EINVAL;
}
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (!kvmgt_gfn_is_write_protected(info, gfn))
goto out;
@@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
kvmgt_protect_table_del(info, gfn);
out:
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
@@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
struct kvmgt_guest_info *info = container_of(node,
struct kvmgt_guest_info, track_node);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < slot->npages; i++) {
gfn = slot->base_gfn + i;
if (kvmgt_gfn_is_write_protected(info, gfn)) {
@@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
kvmgt_protect_table_del(info, gfn);
}
}
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
diff --git a/fs/dax.c b/fs/dax.c
index 26d5dcd2d69e..b3d27fdc6775 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -810,11 +810,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide range to follow_pte it will call
+ * follow_invalidate_pte() will use the range to call
* mmu_notifier_invalidate_range_start() on our behalf before
* taking any lock.
*/
- if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
+ if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
+ &pmdp, &ptl))
continue;
/*
diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
index 365345f9a9e3..07a36a874dca 100644
--- a/include/asm-generic/export.h
+++ b/include/asm-generic/export.h
@@ -33,7 +33,7 @@
*/
.macro ___EXPORT_SYMBOL name,val,sec
-#ifdef CONFIG_MODULES
+#if defined(CONFIG_MODULES) && !defined(__DISABLE_EXPORTS)
.section ___ksymtab\sec+\name,"a"
.balign KSYM_ALIGN
__ksymtab_\name:
diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 84ce841ce735..7ae0ece07b4e 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -15,6 +15,8 @@
#include <asm-generic/qrwlock_types.h>
+/* Must be included from asm/spinlock.h after defining arch_spin_is_locked. */
+
/*
* Writer states & reader shift and bias.
*/
@@ -116,15 +118,26 @@ static inline void queued_write_unlock(struct qrwlock *lock)
smp_store_release(&lock->wlocked, 0);
}
+/**
+ * queued_rwlock_is_contended - check if the lock is contended
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+ return arch_spin_is_locked(&lock->wait_lock);
+}
+
/*
* Remapping rwlock architecture specific functions to the corresponding
* queue rwlock functions.
*/
-#define arch_read_lock(l) queued_read_lock(l)
-#define arch_write_lock(l) queued_write_lock(l)
-#define arch_read_trylock(l) queued_read_trylock(l)
-#define arch_write_trylock(l) queued_write_trylock(l)
-#define arch_read_unlock(l) queued_read_unlock(l)
-#define arch_write_unlock(l) queued_write_unlock(l)
+#define arch_read_lock(l) queued_read_lock(l)
+#define arch_write_lock(l) queued_write_lock(l)
+#define arch_read_trylock(l) queued_read_trylock(l)
+#define arch_write_trylock(l) queued_write_trylock(l)
+#define arch_read_unlock(l) queued_read_unlock(l)
+#define arch_write_unlock(l) queued_write_unlock(l)
+#define arch_rwlock_is_contended(l) queued_rwlock_is_contended(l)
#endif /* __ASM_GENERIC_QRWLOCK_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..e126ebda36d0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -425,9 +425,8 @@ struct kvm_irq_routing_table {
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
-#ifndef KVM_MEM_SLOTS_NUM
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#endif
+#define KVM_MEM_SLOTS_NUM SHRT_MAX
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
@@ -451,7 +450,12 @@ struct kvm_memslots {
};
struct kvm {
+#ifdef KVM_HAVE_MMU_RWLOCK
+ rwlock_t mmu_lock;
+#else
spinlock_t mmu_lock;
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
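
Since mmu_lock can now be either a spinlock or an rwlock depending on KVM_HAVE_MMU_RWLOCK, architecture-neutral callers need a thin abstraction rather than open-coded spin_lock(&kvm->mmu_lock). A sketch of such wrappers (the series adds helpers along these lines in common code; treat the exact names here as illustrative):

#include <linux/kvm_host.h>
#include <linux/spinlock.h>

static inline void kvm_mmu_lock(struct kvm *kvm)
{
#ifdef KVM_HAVE_MMU_RWLOCK
	write_lock(&kvm->mmu_lock);
#else
	spin_lock(&kvm->mmu_lock);
#endif
}

static inline void kvm_mmu_unlock(struct kvm *kvm)
{
#ifdef KVM_HAVE_MMU_RWLOCK
	write_unlock(&kvm->mmu_lock);
#else
	spin_unlock(&kvm->mmu_lock);
#endif
}
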
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ab959488bc0f..1696ee6ab22d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1663,9 +1663,11 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp,
+ pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
- spinlock_t **ptlp);
+ pte_t **ptepp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 49d155cd2dfe..b801ead1e2bb 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -66,6 +66,7 @@ enum sev_cmd {
SEV_CMD_LAUNCH_MEASURE = 0x033,
SEV_CMD_LAUNCH_UPDATE_SECRET = 0x034,
SEV_CMD_LAUNCH_FINISH = 0x035,
+ SEV_CMD_ATTESTATION_REPORT = 0x036,
/* Guest migration commands (outgoing) */
SEV_CMD_SEND_START = 0x040,
@@ -483,6 +484,22 @@ struct sev_data_dbg {
u32 len; /* In */
} __packed;
+/**
+ * struct sev_data_attestation_report - SEV_ATTESTATION_REPORT command parameters
+ *
+ * @handle: handle of the VM
+ * @mnonce: a random nonce that will be included in the report.
+ * @address: physical address where the report will be copied.
+ * @len: length of the physical buffer.
+ */
+struct sev_data_attestation_report {
+ u32 handle; /* In */
+ u32 reserved;
+ u64 address; /* In */
+ u8 mnonce[16]; /* In */
+ u32 len; /* In/Out */
+} __packed;
+
#ifdef CONFIG_CRYPTO_DEV_SP_PSP
/**
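
For context on how sev_data_attestation_report above is reached from userspace: the VMM issues the new KVM_SEV_GET_ATTESTATION_REPORT command (added to the uapi later in this diff) through KVM's memory-encryption ioctl. A hedged sketch; KVM_MEMORY_ENCRYPT_OP and struct kvm_sev_cmd are pre-existing uapi not shown in this excerpt, and the buffer handling is an assumption:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int get_attestation_report(int vm_fd, int sev_fd,
				  const unsigned char nonce[16],
				  void *buf, unsigned int buf_len)
{
	struct kvm_sev_attestation_report report = {
		.uaddr = (unsigned long)buf,
		.len = buf_len,
	};
	struct kvm_sev_cmd cmd = {
		.id = KVM_SEV_GET_ATTESTATION_REPORT,
		.data = (unsigned long)&report,
		.sev_fd = sev_fd,
	};

	memcpy(report.mnonce, nonce, sizeof(report.mnonce));

	/*
	 * On success the signed report is written to 'buf'; see the API
	 * documentation for how 'len' is updated by the firmware.
	 */
	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}
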
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 3dcd617e65ae..7ce9a51ae5c0 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -128,4 +128,11 @@ do { \
1 : ({ local_irq_restore(flags); 0; }); \
})
+#ifdef arch_rwlock_is_contended
+#define rwlock_is_contended(lock) \
+ arch_rwlock_is_contended(&(lock)->raw_lock)
+#else
+#define rwlock_is_contended(lock) ((void)(lock), 0)
+#endif /* arch_rwlock_is_contended */
+
#endif /* __LINUX_RWLOCK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d568288abf9..26f499810dfa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1904,12 +1904,24 @@ static inline int _cond_resched(void) { return 0; }
})
extern int __cond_resched_lock(spinlock_t *lock);
+extern int __cond_resched_rwlock_read(rwlock_t *lock);
+extern int __cond_resched_rwlock_write(rwlock_t *lock);
#define cond_resched_lock(lock) ({ \
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
__cond_resched_lock(lock); \
})
+#define cond_resched_rwlock_read(lock) ({ \
+ __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+ __cond_resched_rwlock_read(lock); \
+})
+
+#define cond_resched_rwlock_write(lock) ({ \
+ __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+ __cond_resched_rwlock_write(lock); \
+})
+
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
@@ -1933,6 +1945,23 @@ static inline int spin_needbreak(spinlock_t *lock)
#endif
}
+/*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+ return rwlock_is_contended(lock);
+#else
+ return 0;
+#endif
+}
+
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
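
cond_resched_rwlock_read()/write() exist so that long walks under an rwlock (the TDP MMU paths this series enables are the intended user) can yield both to the scheduler and to queued lockers. A hedged sketch of the calling pattern with invented data-structure names; real code must revalidate anything it cached whenever the helper reports that it dropped the lock:

#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_entry { unsigned long payload; };

struct my_table {
	rwlock_t lock;
	unsigned long nr_entries;
	struct my_entry *entries;
};

/* Long read-side scan that stays scheduler- and writer-friendly. */
static unsigned long sum_big_table(struct my_table *t)
{
	unsigned long i, sum = 0;

	read_lock(&t->lock);
	for (i = 0; i < t->nr_entries; i++) {
		sum += t->entries[i].payload;

		/*
		 * Drops and re-takes t->lock if the CPU is needed or a waiter
		 * is queued on the lock (rwlock_needbreak()); returns non-zero
		 * when it did, so real code would revalidate cached state.
		 */
		cond_resched_rwlock_read(&t->lock);
	}
	read_unlock(&t->lock);

	return sum;
}
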
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 374c67875cdb..8b281f722e5b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -216,6 +216,20 @@ struct kvm_hyperv_exit {
} u;
};
+struct kvm_xen_exit {
+#define KVM_EXIT_XEN_HCALL 1
+ __u32 type;
+ union {
+ struct {
+ __u32 longmode;
+ __u32 cpl;
+ __u64 input;
+ __u64 result;
+ __u64 params[6];
+ } hcall;
+ } u;
+};
+
#define KVM_S390_GET_SKEYS_NONE 1
#define KVM_S390_SKEYS_MAX 1048576
@@ -252,6 +266,8 @@ struct kvm_hyperv_exit {
#define KVM_EXIT_X86_WRMSR 30
#define KVM_EXIT_DIRTY_RING_FULL 31
#define KVM_EXIT_AP_RESET_HOLD 32
+#define KVM_EXIT_X86_BUS_LOCK 33
+#define KVM_EXIT_XEN 34
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -428,6 +444,8 @@ struct kvm_run {
__u32 index; /* kernel -> user */
__u64 data; /* kernel <-> user */
} msr;
+ /* KVM_EXIT_XEN */
+ struct kvm_xen_exit xen;
/* Fix the size of the union. */
char padding[256];
};
@@ -1058,6 +1076,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
#define KVM_CAP_SYS_HYPERV_CPUID 191
#define KVM_CAP_DIRTY_LOG_RING 192
+#define KVM_CAP_X86_BUS_LOCK_EXIT 193
+#define KVM_CAP_PPC_DAWR1 194
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1131,6 +1151,10 @@ struct kvm_x86_mce {
#endif
#ifdef KVM_CAP_XEN_HVM
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
+
struct kvm_xen_hvm_config {
__u32 flags;
__u32 msr;
@@ -1565,6 +1589,45 @@ struct kvm_pv_cmd {
/* Available with KVM_CAP_DIRTY_LOG_RING */
#define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7)
+/* Per-VM Xen attributes */
+#define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr)
+#define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr)
+
+struct kvm_xen_hvm_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u8 long_mode;
+ __u8 vector;
+ struct {
+ __u64 gfn;
+ } shared_info;
+ __u64 pad[8];
+ } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
+
+/* Per-vCPU Xen attributes */
+#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
+#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr)
+
+struct kvm_xen_vcpu_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u64 gpa;
+ __u64 pad[8];
+ } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
@@ -1593,6 +1656,8 @@ enum sev_cmd_id {
KVM_SEV_DBG_ENCRYPT,
/* Guest certificates commands */
KVM_SEV_CERT_EXPORT,
+ /* Attestation report */
+ KVM_SEV_GET_ATTESTATION_REPORT,
KVM_SEV_NR_MAX,
};
@@ -1645,6 +1710,12 @@ struct kvm_sev_dbg {
__u32 len;
};
+struct kvm_sev_attestation_report {
+ __u8 mnonce[16];
+ __u64 uaddr;
+ __u32 len;
+};
+
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
@@ -1766,4 +1837,7 @@ struct kvm_dirty_gfn {
__u64 offset;
};
+#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0)
+#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
+
#endif /* __LINUX_KVM_H */
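
Tying the new Xen uapi together: a VMM enables hypercall interception once per VM and then uses the attribute ioctls above to describe the guest's Xen state. A hedged sketch; the MSR index and the error handling are illustrative, and KVM_XEN_HVM_CONFIG itself is pre-existing uapi not shown in this excerpt:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int setup_xen(int vm_fd, __u64 shinfo_gfn, __u8 upcall_vector)
{
	struct kvm_xen_hvm_config cfg = {
		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
		.msr = 0x40000200,	/* hypercall-page MSR the VMM advertises */
	};
	struct kvm_xen_hvm_attr lm = {
		.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
		.u.long_mode = 1,
	};
	struct kvm_xen_hvm_attr shinfo = {
		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
		.u.shared_info.gfn = shinfo_gfn,
	};
	struct kvm_xen_hvm_attr vec = {
		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
		.u.vector = upcall_vector,	/* non-zero vectors must be >= 0x10 */
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) ||
	    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &lm) ||
	    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &shinfo) ||
	    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &vec))
		return -1;
	return 0;
}
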
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 8bfb242f433e..5ee37a296481 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -598,7 +598,9 @@ struct shared_info {
* their gettimeofday() syscall on this wallclock-base value.
*/
struct pvclock_wall_clock wc;
-
+#ifndef CONFIG_X86_32
+ uint32_t wc_sec_hi;
+#endif
struct arch_shared_info arch;
};
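
The new wc_sec_hi field carries the high 32 bits of the wall-clock seconds; kvm_write_wall_clock() is handed its offset relative to wc so both halves are updated under the same version protocol. A hedged guest-side sketch of reading it, with the pvclock layout restated locally and the barriers reduced to compiler barriers for brevity:

#include <stdint.h>

/* Matches the layout of struct pvclock_wall_clock: version, sec, nsec. */
struct wall_clock {
	uint32_t version;
	uint32_t sec;
	uint32_t nsec;
};

/*
 * Seqcount-style read: the writer makes 'version' odd before updating and
 * even afterwards, so a torn read is detected and retried.
 */
static uint64_t read_wall_clock_sec(const volatile struct wall_clock *wc,
				    const volatile uint32_t *wc_sec_hi)
{
	uint32_t ver, lo, hi;

	do {
		ver = wc->version;
		__asm__ __volatile__("" ::: "memory");
		lo = wc->sec;
		hi = wc_sec_hi ? *wc_sec_hi : 0;
		__asm__ __volatile__("" ::: "memory");
	} while ((ver & 1) || ver != wc->version);

	return ((uint64_t)hi << 32) | lo;
}
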
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fe9ca92faa2a..4786dd271b45 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,7 +12,6 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
-#include <asm/qrwlock.h>
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f5ffc878411..ca2bb629595f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7028,6 +7028,46 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_read(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held_write(lock);
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
/**
* yield - yield the current processor to other threads.
*
diff --git a/mm/memory.c b/mm/memory.c
index b32f32bf584d..c32318dc11d4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4642,9 +4642,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-int follow_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
- spinlock_t **ptlp)
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp,
+ pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4710,6 +4710,34 @@ out:
}
/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+}
+EXPORT_SYMBOL_GPL(follow_pte);
+
+/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
@@ -4717,6 +4745,9 @@ out:
*
* Only IO mappings and raw PFN mappings are allowed.
*
+ * This function does not allow the caller to read the permissions
+ * of the PTE. Do not use it.
+ *
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4729,7 +4760,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
- ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
@@ -4750,7 +4781,7 @@ int follow_phys(struct vm_area_struct *vma,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
- if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
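
To make the new split concrete: follow_pte() is now the lightweight lookup without an MMU-notifier range, while follow_invalidate_pte() keeps the old five-argument behaviour. A short sketch of the follow_pte() calling convention, mirroring what follow_pfn() above does; the helper name is made up:

#include <linux/mm.h>

/*
 * Translate a user virtual address in a VM_IO/VM_PFNMAP vma to a pfn.
 * The caller must hold mmap_read_lock(vma->vm_mm); the pte is only stable
 * until pte_unmap_unlock() drops the page-table lock.
 */
static int lookup_io_pfn(struct vm_area_struct *vma, unsigned long addr,
			 unsigned long *pfn)
{
	spinlock_t *ptl;
	pte_t *ptep;
	int ret;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

	ret = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
	if (ret)
		return ret;

	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
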
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index c3af3f324c5a..9f18fa090f1f 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1)
#define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2)
#define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
+#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
+#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 374c67875cdb..abb89bbe5635 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
#define KVM_CAP_SYS_HYPERV_CPUID 191
#define KVM_CAP_DIRTY_LOG_RING 192
+#define KVM_CAP_PPC_DAWR1 194
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index ce8f4ad39684..3a84394829ea 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -7,6 +7,7 @@
/x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs
/x86_64/evmcs_test
+/x86_64/get_cpuid_test
/x86_64/kvm_pv_test
/x86_64/hyperv_cpuid
/x86_64/mmio_warning_test
@@ -24,10 +25,15 @@
/x86_64/vmx_preemption_timer_test
/x86_64/vmx_set_nested_state_test
/x86_64/vmx_tsc_adjust_test
+/x86_64/xapic_ipi_test
+/x86_64/xen_shinfo_test
+/x86_64/xen_vmcall_test
/x86_64/xss_msr_test
+/x86_64/vmx_pmu_msrs_test
/demand_paging_test
/dirty_log_test
/dirty_log_perf_test
/kvm_create_max_vcpus
+/memslot_modification_stress_test
/set_memory_region_test
/steal_time
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index fe41c6a0fa67..8c8eda429576 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -40,6 +40,7 @@ LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_ha
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
+TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
@@ -56,13 +57,18 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
+TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index cdad1eca72f7..5f7a229c3af1 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -64,7 +64,7 @@ static void *vcpu_worker(void *data)
exit_reason_str(run->exit_reason));
}
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -95,7 +95,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
return r;
}
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
timespec_to_ns(ts_diff));
@@ -190,7 +190,7 @@ static void *uffd_handler_thread_fn(void *arg)
pages++;
}
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
pages, ts_diff.tv_sec, ts_diff.tv_nsec,
pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
@@ -250,6 +250,7 @@ static int setup_demand_paging(struct kvm_vm *vm,
struct test_params {
bool use_uffd;
useconds_t uffd_delay;
+ bool partition_vcpu_memory_access;
};
static void run_test(enum vm_guest_mode mode, void *arg)
@@ -265,7 +266,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
int vcpu_id;
int r;
- vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+ vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+ VM_MEM_SRC_ANONYMOUS);
perf_test_args.wr_fract = 1;
@@ -277,7 +279,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
- perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+ perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+ p->partition_vcpu_memory_access);
if (p->use_uffd) {
uffd_handler_threads =
@@ -293,10 +296,19 @@ static void run_test(enum vm_guest_mode mode, void *arg)
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vm_paddr_t vcpu_gpa;
void *vcpu_hva;
+ uint64_t vcpu_mem_size;
- vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size);
+
+ if (p->partition_vcpu_memory_access) {
+ vcpu_gpa = guest_test_phys_mem +
+ (vcpu_id * guest_percpu_mem_size);
+ vcpu_mem_size = guest_percpu_mem_size;
+ } else {
+ vcpu_gpa = guest_test_phys_mem;
+ vcpu_mem_size = guest_percpu_mem_size * nr_vcpus;
+ }
PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
- vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size);
+ vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
/* Cache the HVA pointer of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
@@ -313,7 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
&uffd_handler_threads[vcpu_id],
pipefds[vcpu_id * 2],
p->uffd_delay, &uffd_args[vcpu_id],
- vcpu_hva, guest_percpu_mem_size);
+ vcpu_hva, vcpu_mem_size);
if (r < 0)
exit(-r);
}
@@ -339,7 +351,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
}
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
pr_info("All vCPU threads joined\n");
@@ -376,7 +388,7 @@ static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
- " [-b memory] [-v vcpus]\n", name);
+ " [-b memory] [-v vcpus] [-o]\n", name);
guest_modes_help();
printf(" -u: use User Fault FD to handle vCPU page\n"
" faults.\n");
@@ -387,6 +399,8 @@ static void help(char *name)
" demand paged by each vCPU. e.g. 10M or 3G.\n"
" Default: 1G\n");
printf(" -v: specify the number of vCPUs to run.\n");
+ printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+ " them into a separate region of memory for each vCPU.\n");
puts("");
exit(0);
}
@@ -394,12 +408,14 @@ static void help(char *name)
int main(int argc, char *argv[])
{
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
- struct test_params p = {};
+ struct test_params p = {
+ .partition_vcpu_memory_access = true,
+ };
int opt;
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -419,6 +435,9 @@ int main(int argc, char *argv[])
TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
+ case 'o':
+ p.partition_vcpu_memory_access = false;
+ break;
case 'h':
default:
help(argv[0]);
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 2283a0ec74a9..04a2641261be 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -28,8 +28,8 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
/* Host variables */
static u64 dirty_log_manual_caps;
static bool host_quit;
-static uint64_t iteration;
-static uint64_t vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+static int iteration;
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
static void *vcpu_worker(void *data)
{
@@ -48,11 +48,11 @@ static void *vcpu_worker(void *data)
run = vcpu_state(vm, vcpu_id);
while (!READ_ONCE(host_quit)) {
- uint64_t current_iteration = READ_ONCE(iteration);
+ int current_iteration = READ_ONCE(iteration);
clock_gettime(CLOCK_MONOTONIC, &start);
ret = _vcpu_run(vm, vcpu_id);
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
@@ -61,17 +61,17 @@ static void *vcpu_worker(void *data)
pr_debug("Got sync event from vCPU %d\n", vcpu_id);
vcpu_last_completed_iteration[vcpu_id] = current_iteration;
- pr_debug("vCPU %d updated last completed iteration to %lu\n",
+ pr_debug("vCPU %d updated last completed iteration to %d\n",
vcpu_id, vcpu_last_completed_iteration[vcpu_id]);
if (current_iteration) {
pages_count += vcpu_args->pages;
total = timespec_add(total, ts_diff);
- pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n",
+ pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n",
vcpu_id, current_iteration, ts_diff.tv_sec,
ts_diff.tv_nsec);
} else {
- pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n",
+ pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n",
vcpu_id, current_iteration, ts_diff.tv_sec,
ts_diff.tv_nsec);
}
@@ -81,7 +81,7 @@ static void *vcpu_worker(void *data)
}
avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]);
- pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+ pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id],
total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
@@ -92,6 +92,8 @@ struct test_params {
unsigned long iterations;
uint64_t phys_offset;
int wr_fract;
+ bool partition_vcpu_memory_access;
+ enum vm_mem_backing_src_type backing_src;
};
static void run_test(enum vm_guest_mode mode, void *arg)
@@ -111,7 +113,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct kvm_enable_cap cap = {};
struct timespec clear_dirty_log_total = (struct timespec){0};
- vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+ vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+ p->backing_src);
perf_test_args.wr_fract = p->wr_fract;
@@ -129,7 +132,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
- perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+ perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+ p->partition_vcpu_memory_access);
sync_global_to_guest(vm, perf_test_args);
@@ -139,17 +143,21 @@ static void run_test(enum vm_guest_mode mode, void *arg)
clock_gettime(CLOCK_MONOTONIC, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ vcpu_last_completed_iteration[vcpu_id] = -1;
+
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
&perf_test_args.vcpu_args[vcpu_id]);
}
- /* Allow the vCPU to populate memory */
- pr_debug("Starting iteration %lu - Populating\n", iteration);
- while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
- pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n",
- iteration);
+ /* Allow the vCPUs to populate memory */
+ pr_debug("Starting iteration %d - Populating\n", iteration);
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) !=
+ iteration)
+ ;
+ }
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
pr_info("Populate memory time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -157,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
clock_gettime(CLOCK_MONOTONIC, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -169,25 +177,25 @@ static void run_test(enum vm_guest_mode mode, void *arg)
clock_gettime(CLOCK_MONOTONIC, &start);
iteration++;
- pr_debug("Starting iteration %lu\n", iteration);
+ pr_debug("Starting iteration %d\n", iteration);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
- while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
- pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
- vcpu_id, iteration);
+ while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id])
+ != iteration)
+ ;
}
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
- pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n",
+ pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
get_dirty_log_total = timespec_add(get_dirty_log_total,
ts_diff);
- pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n",
+ pr_info("Iteration %d get dirty log time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
if (dirty_log_manual_caps) {
@@ -195,26 +203,26 @@ static void run_test(enum vm_guest_mode mode, void *arg)
kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0,
host_num_pages);
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
clear_dirty_log_total = timespec_add(clear_dirty_log_total,
ts_diff);
- pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n",
+ pr_info("Iteration %d clear dirty log time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
}
}
- /* Tell the vcpu thread to quit */
- host_quit = true;
- for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
- pthread_join(vcpu_threads[vcpu_id], NULL);
-
/* Disable dirty logging */
clock_gettime(CLOCK_MONOTONIC, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0);
- ts_diff = timespec_diff_now(start);
+ ts_diff = timespec_elapsed(start);
pr_info("Disabling dirty logging time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
+ /* Tell the vcpu thread to quit */
+ host_quit = true;
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+ pthread_join(vcpu_threads[vcpu_id], NULL);
+
avg = timespec_div(get_dirty_log_total, p->iterations);
pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
p->iterations, get_dirty_log_total.tv_sec,
@@ -236,7 +244,7 @@ static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-i iterations] [-p offset] "
- "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name);
+ "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name);
puts("");
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
TEST_HOST_LOOP_N);
@@ -251,6 +259,11 @@ static void help(char *name)
" 1/<fraction of pages to write>.\n"
" (default: 1 i.e. all pages are written to.)\n");
printf(" -v: specify the number of vCPUs to run.\n");
+ printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+ " them into a separate region of memory for each vCPU.\n");
+ printf(" -s: specify the type of memory that should be used to\n"
+ " back the guest data region.\n\n");
+ backing_src_help();
puts("");
exit(0);
}
@@ -261,6 +274,8 @@ int main(int argc, char *argv[])
struct test_params p = {
.iterations = TEST_HOST_LOOP_N,
.wr_fract = 1,
+ .partition_vcpu_memory_access = true,
+ .backing_src = VM_MEM_SRC_ANONYMOUS,
};
int opt;
@@ -271,10 +286,10 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) {
+ while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) {
switch (opt) {
case 'i':
- p.iterations = strtol(optarg, NULL, 10);
+ p.iterations = atoi(optarg);
break;
case 'p':
p.phys_offset = strtoull(optarg, NULL, 0);
@@ -295,6 +310,11 @@ int main(int argc, char *argv[])
TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
+ case 'o':
+ p.partition_vcpu_memory_access = false;
+ break;
+ case 's':
+ p.backing_src = parse_backing_src_type(optarg);
+ break;
case 'h':
default:
help(argv[0]);
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 5cbb861525ed..2d7eb6989e83 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -79,12 +79,6 @@ struct vm_guest_mode_params {
};
extern const struct vm_guest_mode_params vm_guest_mode_params[];
-enum vm_mem_backing_src_type {
- VM_MEM_SRC_ANONYMOUS,
- VM_MEM_SRC_ANONYMOUS_THP,
- VM_MEM_SRC_ANONYMOUS_HUGETLB,
-};
-
int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h
new file mode 100644
index 000000000000..b020547403fd
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/numaif.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/numaif.h
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Header file that provides access to NUMA API functions not explicitly
+ * exported to user space.
+ */
+
+#ifndef SELFTEST_KVM_NUMAIF_H
+#define SELFTEST_KVM_NUMAIF_H
+
+#define __NR_get_mempolicy 239
+#define __NR_migrate_pages 256
+
+/* System calls */
+long get_mempolicy(int *policy, const unsigned long *nmask,
+ unsigned long maxnode, void *addr, int flags)
+{
+ return syscall(__NR_get_mempolicy, policy, nmask,
+ maxnode, addr, flags);
+}
+
+long migrate_pages(int pid, unsigned long maxnode,
+ const unsigned long *frommask,
+ const unsigned long *tomask)
+{
+ return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask);
+}
+
+/* Policies */
+#define MPOL_DEFAULT 0
+#define MPOL_PREFERRED 1
+#define MPOL_BIND 2
+#define MPOL_INTERLEAVE 3
+
+#define MPOL_MAX MPOL_INTERLEAVE
+
+/* Flags for get_mempolicy */
+#define MPOL_F_NODE (1<<0) /* return next il node or node of address */
+ /* Warning: MPOL_F_NODE is unsupported and
+ * subject to change. Don't use.
+ */
+#define MPOL_F_ADDR (1<<1) /* look up vma using address */
+#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */
+
+/* Flags for mbind */
+#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
+
+#endif /* SELFTEST_KVM_NUMAIF_H */
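
The new numaif.h wraps the get_mempolicy() and migrate_pages() syscalls for tests that are not linked against libnuma. The following is a minimal host-side sketch, not part of the patch, showing how the two wrappers combine: discover the allowed NUMA nodes with MPOL_F_MEMS_ALLOWED and migrate this process' pages between the first two nodes found, the same pattern do_migrations() uses in xapic_ipi_test further down.

/*
 * Sketch: query allowed NUMA nodes, then migrate all of this process'
 * pages between the first two nodes found.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include "numaif.h"

int main(void)
{
        unsigned long nodemask = 0;
        unsigned long from = 0, to = 0;
        int i, found = 0;
        long ret;

        /* addr must be NULL when MPOL_F_MEMS_ALLOWED is used. */
        ret = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8,
                            NULL, MPOL_F_MEMS_ALLOWED);
        if (ret) {
                fprintf(stderr, "get_mempolicy failed, errno=%d\n", errno);
                return 1;
        }

        for (i = 0; i < 64 && found < 2; i++) {
                if (nodemask & (1UL << i)) {
                        if (!found++)
                                from = 1UL << i;
                        else
                                to = 1UL << i;
                }
        }
        if (found < 2) {
                fprintf(stderr, "need at least two NUMA nodes\n");
                return 1;
        }

        /* PID 0 means "this process"; masks select source and target nodes. */
        ret = migrate_pages(0, sizeof(nodemask) * 8, &from, &to);
        if (ret < 0)
                fprintf(stderr, "migrate_pages failed, errno=%d\n", errno);
        else
                printf("migrate_pages left %ld pages unmoved\n", ret);
        return 0;
}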
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index b1188823c31b..005f2143adeb 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -44,8 +44,11 @@ extern struct perf_test_args perf_test_args;
extern uint64_t guest_test_phys_mem;
struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
- uint64_t vcpu_memory_bytes);
+ uint64_t vcpu_memory_bytes,
+ enum vm_mem_backing_src_type backing_src);
void perf_test_destroy_vm(struct kvm_vm *vm);
-void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes);
+void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
+ uint64_t vcpu_memory_bytes,
+ bool partition_vcpu_memory_access);
#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index ffffa560436b..b7f41399f22c 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -64,7 +64,21 @@ int64_t timespec_to_ns(struct timespec ts);
struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
-struct timespec timespec_diff_now(struct timespec start);
+struct timespec timespec_elapsed(struct timespec start);
struct timespec timespec_div(struct timespec ts, int divisor);
+enum vm_mem_backing_src_type {
+ VM_MEM_SRC_ANONYMOUS,
+ VM_MEM_SRC_ANONYMOUS_THP,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB,
+};
+
+struct vm_mem_backing_src_alias {
+ const char *name;
+ enum vm_mem_backing_src_type type;
+};
+
+void backing_src_help(void);
+enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
+
#endif /* SELFTEST_KVM_TEST_UTIL_H */
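
The relocated vm_mem_backing_src_type enum and the backing_src_help()/parse_backing_src_type() pair give every test a uniform way to expose a -s option and forward the result to perf_test_create_vm(). A hedged sketch of that wiring, using only names declared in these headers; the function name and option letter are illustrative, mirroring dirty_log_perf_test above.

/* Sketch: parse a -s backing source option and create a perf test VM. */
#include <stdlib.h>
#include <unistd.h>
#include "kvm_util.h"
#include "perf_test_util.h"
#include "test_util.h"

static struct kvm_vm *create_vm_with_backing_src(int argc, char *argv[])
{
        enum vm_mem_backing_src_type src = VM_MEM_SRC_ANONYMOUS;
        struct kvm_vm *vm;
        int opt;

        while ((opt = getopt(argc, argv, "s:")) != -1) {
                switch (opt) {
                case 's':
                        /* "anonymous", "anonymous_thp" or "anonymous_hugetlb" */
                        src = parse_backing_src_type(optarg);
                        break;
                default:
                        backing_src_help();
                        exit(1);
                }
        }

        /* One vCPU, default per-vCPU memory, backed by the chosen source. */
        vm = perf_test_create_vm(VM_MODE_DEFAULT, 1, DEFAULT_PER_VCPU_MEM_SIZE,
                                 src);
        perf_test_setup_vcpus(vm, 1, DEFAULT_PER_VCPU_MEM_SIZE, true);

        return vm;
}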
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 90cd5984751b..0b30b4e15c38 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -263,6 +263,19 @@ static inline void outl(uint16_t port, uint32_t value)
__asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value));
}
+static inline void cpuid(uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
#define SET_XMM(__var, __xmm) \
asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
@@ -338,8 +351,10 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_x86_state *state);
struct kvm_msr_list *kvm_get_msr_index_list(void);
-
+uint64_t kvm_get_feature_msr(uint64_t msr_index);
struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
+
+struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
@@ -391,6 +406,10 @@ bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent);
uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
uint64_t a3);
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
+void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+
/*
* Basic CPU control in CR0
*/
@@ -406,8 +425,27 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
#define X86_CR0_CD (1UL<<30) /* Cache Disable */
#define X86_CR0_PG (1UL<<31) /* Paging */
+#define APIC_DEFAULT_GPA 0xfee00000ULL
+
+/* APIC base address MSR and fields */
+#define MSR_IA32_APICBASE 0x0000001b
+#define MSR_IA32_APICBASE_BSP (1<<8)
+#define MSR_IA32_APICBASE_EXTD (1<<10)
+#define MSR_IA32_APICBASE_ENABLE (1<<11)
+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
+#define GET_APIC_BASE(x) (((x) >> 12) << 12)
+
#define APIC_BASE_MSR 0x800
#define X2APIC_ENABLE (1UL << 10)
+#define APIC_ID 0x20
+#define APIC_LVR 0x30
+#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF)
+#define APIC_TASKPRI 0x80
+#define APIC_PROCPRI 0xA0
+#define APIC_EOI 0xB0
+#define APIC_SPIV 0xF0
+#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
+#define APIC_SPIV_APIC_ENABLED (1 << 8)
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
@@ -432,6 +470,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
#define APIC_DM_EXTINT 0x00700
#define APIC_VECTOR_MASK 0x000FF
#define APIC_ICR2 0x310
+#define SET_APIC_DEST_FIELD(x) ((x) << 24)
/* VMX_EPT_VPID_CAP bits */
#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21)
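
The new cpuid() helper and the APIC base definitions are consumed by the tests later in this series. A small guest-code sketch, assuming it runs inside a selftest guest; the leaf value and assertions are illustrative rather than taken from any one test.

/* Guest-code sketch: probe the hypervisor CPUID leaf and the xAPIC base. */
#include "kvm_util.h"
#include "processor.h"

static void guest_probe_cpuid_and_apic(void)
{
        uint32_t eax, ebx, ecx, edx;
        uint64_t apic_base_msr;

        /* Leaf 0x40000000 returns the maximum hypervisor leaf in EAX. */
        eax = 0x40000000;
        ecx = 0;
        cpuid(&eax, &ebx, &ecx, &edx);
        GUEST_ASSERT(eax >= 0x40000000);

        /* The xAPIC is expected to be enabled at the architectural base. */
        apic_base_msr = rdmsr(MSR_IA32_APICBASE);
        GUEST_ASSERT(apic_base_msr & MSR_IA32_APICBASE_ENABLE);
        GUEST_ASSERT(GET_APIC_BASE(apic_base_msr) == APIC_DEFAULT_GPA);
}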
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index fa5a90e6c6f0..d787cb802b4a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1801,6 +1801,7 @@ static struct exit_reason {
{KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
{KVM_EXIT_X86_RDMSR, "RDMSR"},
{KVM_EXIT_X86_WRMSR, "WRMSR"},
+ {KVM_EXIT_XEN, "XEN"},
#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
{KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
#endif
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 9be1944c2d1c..81490b9b4e32 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -49,7 +49,8 @@ static void guest_code(uint32_t vcpu_id)
}
struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
- uint64_t vcpu_memory_bytes)
+ uint64_t vcpu_memory_bytes,
+ enum vm_mem_backing_src_type backing_src)
{
struct kvm_vm *vm;
uint64_t guest_num_pages;
@@ -93,8 +94,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
/* Add an extra memory slot for testing */
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
- guest_test_phys_mem,
+ vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem,
PERF_TEST_MEM_SLOT_INDEX,
guest_num_pages, 0);
@@ -112,7 +112,9 @@ void perf_test_destroy_vm(struct kvm_vm *vm)
kvm_vm_free(vm);
}
-void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes)
+void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
+ uint64_t vcpu_memory_bytes,
+ bool partition_vcpu_memory_access)
{
vm_paddr_t vcpu_gpa;
struct perf_test_vcpu_args *vcpu_args;
@@ -122,13 +124,22 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_by
vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
vcpu_args->vcpu_id = vcpu_id;
- vcpu_args->gva = guest_test_virt_mem +
- (vcpu_id * vcpu_memory_bytes);
- vcpu_args->pages = vcpu_memory_bytes /
- perf_test_args.guest_page_size;
+ if (partition_vcpu_memory_access) {
+ vcpu_args->gva = guest_test_virt_mem +
+ (vcpu_id * vcpu_memory_bytes);
+ vcpu_args->pages = vcpu_memory_bytes /
+ perf_test_args.guest_page_size;
+ vcpu_gpa = guest_test_phys_mem +
+ (vcpu_id * vcpu_memory_bytes);
+ } else {
+ vcpu_args->gva = guest_test_virt_mem;
+ vcpu_args->pages = (vcpus * vcpu_memory_bytes) /
+ perf_test_args.guest_page_size;
+ vcpu_gpa = guest_test_phys_mem;
+ }
- vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
- vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
+ vcpu_id, vcpu_gpa, vcpu_gpa +
+ (vcpu_args->pages * perf_test_args.guest_page_size));
}
}
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 8e04c0b1608e..906c955384e2 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
#include <limits.h>
#include <stdlib.h>
#include <time.h>
+#include "linux/kernel.h"
#include "test_util.h"
@@ -84,7 +85,7 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
return timespec_add_ns((struct timespec){0}, ns1 - ns2);
}
-struct timespec timespec_diff_now(struct timespec start)
+struct timespec timespec_elapsed(struct timespec start)
{
struct timespec end;
@@ -109,3 +110,31 @@ void print_skip(const char *fmt, ...)
va_end(ap);
puts(", skipping test");
}
+
+const struct vm_mem_backing_src_alias backing_src_aliases[] = {
+ {"anonymous", VM_MEM_SRC_ANONYMOUS,},
+ {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
+ {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
+};
+
+void backing_src_help(void)
+{
+ int i;
+
+ printf("Available backing src types:\n");
+ for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
+ printf("\t%s\n", backing_src_aliases[i].name);
+}
+
+enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
+ if (!strcmp(type_name, backing_src_aliases[i].name))
+ return backing_src_aliases[i].type;
+
+ backing_src_help();
+ TEST_FAIL("Unknown backing src type: %s", type_name);
+ return -1;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 95e1a757c629..de0c76177d02 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -670,6 +670,82 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
}
/*
+ * KVM Get Feature MSR
+ *
+ * Input Args:
+ * msr_index - Index of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Get the value of the feature MSR, via the system-wide /dev/kvm fd.
+ */
+uint64_t kvm_get_feature_msr(uint64_t msr_index)
+{
+ struct {
+ struct kvm_msrs header;
+ struct kvm_msr_entry entry;
+ } buffer = {};
+ int r, kvm_fd;
+
+ buffer.header.nmsrs = 1;
+ buffer.entry.index = msr_index;
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
+ TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
+ " rc: %i errno: %i", r, errno);
+
+ close(kvm_fd);
+ return buffer.entry.data;
+}
+
+/*
+ * VM VCPU CPUID Get
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU id
+ *
+ * Output Args: None
+ *
+ * Return: KVM CPUID (KVM_GET_CPUID2)
+ *
+ * Return the vCPU's CPUID as reported by KVM_GET_CPUID2.
+ */
+struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ struct kvm_cpuid2 *cpuid;
+ int rc, max_ent;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ cpuid = allocate_kvm_cpuid2();
+ max_ent = cpuid->nent;
+
+ for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) {
+ rc = ioctl(vcpu->fd, KVM_GET_CPUID2, cpuid);
+ if (!rc)
+ break;
+
+ TEST_ASSERT(rc == -1 && errno == E2BIG,
+ "KVM_GET_CPUID2 should either succeed or give E2BIG: %d %d",
+ rc, errno);
+ }
+
+ TEST_ASSERT(rc == 0, "KVM_GET_CPUID2 failed, rc: %i errno: %i",
+ rc, errno);
+
+ return cpuid;
+}
+
+/*
* Locate a cpuid entry.
*
* Input Args:
@@ -1224,3 +1300,71 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
: "b"(a0), "c"(a1), "d"(a2), "S"(a3));
return r;
}
+
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
+{
+ static struct kvm_cpuid2 *cpuid;
+ int ret;
+ int kvm_fd;
+
+ if (cpuid)
+ return cpuid;
+
+ cpuid = allocate_kvm_cpuid2();
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+ TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
+ ret, errno);
+
+ close(kvm_fd);
+ return cpuid;
+}
+
+void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ static struct kvm_cpuid2 *cpuid_full;
+ struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
+ int i, nent = 0;
+
+ if (!cpuid_full) {
+ cpuid_sys = kvm_get_supported_cpuid();
+ cpuid_hv = kvm_get_supported_hv_cpuid();
+
+ cpuid_full = malloc(sizeof(*cpuid_full) +
+ (cpuid_sys->nent + cpuid_hv->nent) *
+ sizeof(struct kvm_cpuid_entry2));
+ if (!cpuid_full) {
+ perror("malloc");
+ abort();
+ }
+
+ /* Need to skip KVM CPUID leaves 0x400000xx */
+ for (i = 0; i < cpuid_sys->nent; i++) {
+ if (cpuid_sys->entries[i].function >= 0x40000000 &&
+ cpuid_sys->entries[i].function < 0x40000100)
+ continue;
+ cpuid_full->entries[nent] = cpuid_sys->entries[i];
+ nent++;
+ }
+
+ memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
+ cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
+ cpuid_full->nent = nent + cpuid_hv->nent;
+ }
+
+ vcpu_set_cpuid(vm, vcpuid, cpuid_full);
+}
+
+struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ static struct kvm_cpuid2 *cpuid;
+
+ cpuid = allocate_kvm_cpuid2();
+
+ vcpu_ioctl(vm, vcpuid, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+ return cpuid;
+}
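
A host-side sketch of the new helpers defined above, mirroring how evmcs_test and vmx_pmu_msrs_test use them further down; the vCPU id, guest_code argument, and the choice of MSR_IA32_PERF_CAPABILITIES are assumptions for illustration.

/*
 * Sketch: read a feature MSR through /dev/kvm, fetch the vCPU's current
 * CPUID with KVM_GET_CPUID2, then install supported plus Hyper-V leaves.
 */
#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"
#include "vmx.h"

#define SKETCH_VCPU_ID 0

static void exercise_new_helpers(void *guest_code)
{
        struct kvm_cpuid2 *cpuid;
        struct kvm_vm *vm;
        uint64_t perf_caps;

        /* Feature MSRs are read via the system-wide /dev/kvm fd. */
        perf_caps = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES);

        vm = vm_create_default(SKETCH_VCPU_ID, 0, guest_code);

        /* The CPUID the vCPU currently has, as reported by KVM_GET_CPUID2. */
        cpuid = vcpu_get_cpuid(vm, SKETCH_VCPU_ID);
        pr_info("vCPU has %u CPUID entries, PERF_CAPABILITIES=%#lx\n",
                cpuid->nent, perf_caps);

        /* Replace it with the supported leaves plus the Hyper-V ones. */
        vcpu_set_hv_cpuid(vm, SKETCH_VCPU_ID);

        kvm_vm_free(vm);
}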
diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c
index 3a5c72ed2b79..827fe6028dd4 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c
@@ -74,7 +74,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa);
memset(vmcb, 0, sizeof(*vmcb));
- asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory");
+ asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory");
vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr);
vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr);
vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr);
@@ -131,19 +131,19 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa)
{
asm volatile (
- "vmload\n\t"
+ "vmload %[vmcb_gpa]\n\t"
"mov rflags, %%r15\n\t" // rflags
"mov %%r15, 0x170(%[vmcb])\n\t"
"mov guest_regs, %%r15\n\t" // rax
"mov %%r15, 0x1f8(%[vmcb])\n\t"
LOAD_GPR_C
- "vmrun\n\t"
+ "vmrun %[vmcb_gpa]\n\t"
SAVE_GPR_C
"mov 0x170(%[vmcb]), %%r15\n\t" // rflags
"mov %%r15, rflags\n\t"
"mov 0x1f8(%[vmcb]), %%r15\n\t" // rax
"mov %%r15, guest_regs\n\t"
- "vmsave\n\t"
+ "vmsave %[vmcb_gpa]\n\t"
: : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa)
: "r15", "memory");
}
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
new file mode 100644
index 000000000000..6096bf0a5b34
--- /dev/null
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM memslot modification stress test
+ * Adapted from demand_paging_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/userfaultfd.h>
+
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "guest_modes.h"
+
+#define DUMMY_MEMSLOT_INDEX 7
+
+#define DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS 10
+
+static int nr_vcpus = 1;
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+static bool run_vcpus = true;
+
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct perf_test_vcpu_args *vcpu_args =
+ (struct perf_test_vcpu_args *)data;
+ int vcpu_id = vcpu_args->vcpu_id;
+ struct kvm_vm *vm = perf_test_args.vm;
+ struct kvm_run *run;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ /* Let the guest access its memory until a stop signal is received */
+ while (READ_ONCE(run_vcpus)) {
+ ret = _vcpu_run(vm, vcpu_id);
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+
+ if (get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC)
+ continue;
+
+ TEST_ASSERT(false,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+ }
+
+ return NULL;
+}
+
+struct memslot_antagonist_args {
+ struct kvm_vm *vm;
+ useconds_t delay;
+ uint64_t nr_modifications;
+};
+
+static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
+ uint64_t nr_modifications, uint64_t gpa)
+{
+ int i;
+
+ for (i = 0; i < nr_modifications; i++) {
+ usleep(delay);
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa,
+ DUMMY_MEMSLOT_INDEX, 1, 0);
+
+ vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX);
+ }
+}
+
+struct test_params {
+ useconds_t memslot_modification_delay;
+ uint64_t nr_memslot_modifications;
+ bool partition_vcpu_memory_access;
+};
+
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+ struct test_params *p = arg;
+ pthread_t *vcpu_threads;
+ struct kvm_vm *vm;
+ int vcpu_id;
+
+ vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+ VM_MEM_SRC_ANONYMOUS);
+
+ perf_test_args.wr_fract = 1;
+
+ vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+ TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+ perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+ p->partition_vcpu_memory_access);
+
+ /* Export the shared variables to the guest */
+ sync_global_to_guest(vm, perf_test_args);
+
+ pr_info("Finished creating vCPUs\n");
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+ pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+ &perf_test_args.vcpu_args[vcpu_id]);
+
+ pr_info("Started all vCPUs\n");
+
+ add_remove_memslot(vm, p->memslot_modification_delay,
+ p->nr_memslot_modifications,
+ guest_test_phys_mem +
+ (guest_percpu_mem_size * nr_vcpus) +
+ perf_test_args.host_page_size +
+ perf_test_args.guest_page_size);
+
+ run_vcpus = false;
+
+ /* Wait for the vcpu threads to quit */
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+ pthread_join(vcpu_threads[vcpu_id], NULL);
+
+ pr_info("All vCPU threads joined\n");
+
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+
+ free(vcpu_threads);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
+ " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
+ guest_modes_help();
+ printf(" -d: add a delay between each iteration of adding and\n"
+ " deleting a memslot in usec.\n");
+ printf(" -b: specify the size of the memory region which should be\n"
+ " accessed by each vCPU. e.g. 10M or 3G.\n"
+ " Default: 1G\n");
+ printf(" -v: specify the number of vCPUs to run.\n");
+ printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+ " them into a separate region of memory for each vCPU.\n");
+ printf(" -i: specify the number of iterations of adding and removing\n"
+ " a memslot.\n"
+ " Default: %d\n", DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS);
+ puts("");
+ exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+ int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ int opt;
+ struct test_params p = {
+ .memslot_modification_delay = 0,
+ .nr_memslot_modifications =
+ DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS,
+ .partition_vcpu_memory_access = true
+ };
+
+ guest_modes_append_default();
+
+ while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
+ switch (opt) {
+ case 'm':
+ guest_modes_cmdline(optarg);
+ break;
+ case 'd':
+ p.memslot_modification_delay = strtoul(optarg, NULL, 0);
+ TEST_ASSERT(p.memslot_modification_delay >= 0,
+ "A negative delay is not supported.");
+ break;
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'v':
+ nr_vcpus = atoi(optarg);
+ TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ "Invalid number of vcpus, must be between 1 and %d",
+ max_vcpus);
+ break;
+ case 'o':
+ p.partition_vcpu_memory_access = false;
+ break;
+ case 'i':
+ p.nr_memslot_modifications = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ break;
+ }
+ }
+
+ for_each_guest_mode(run_test, &p);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/settings b/tools/testing/selftests/kvm/settings
new file mode 100644
index 000000000000..6091b45d226b
--- /dev/null
+++ b/tools/testing/selftests/kvm/settings
@@ -0,0 +1 @@
+timeout=120
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
index 37b8a78f6b74..ca22ee6d19cb 100644
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -99,6 +99,7 @@ int main(int argc, char *argv[])
exit(KSFT_SKIP);
}
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
vcpu_enable_evmcs(vm, VCPU_ID);
run = vcpu_state(vm, VCPU_ID);
@@ -142,7 +143,7 @@ int main(int argc, char *argv[])
/* Restore state in a new VM. */
kvm_vm_restart(vm, O_RDWR);
vm_vcpu_add(vm, VCPU_ID);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
vcpu_enable_evmcs(vm, VCPU_ID);
vcpu_load_state(vm, VCPU_ID, state);
run = vcpu_state(vm, VCPU_ID);
diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
new file mode 100644
index 000000000000..9b78e8889638
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat Inc.
+ *
+ * Generic tests for KVM CPUID set/get ioctls
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+/* CPUIDs known to differ */
+struct {
+ u32 function;
+ u32 index;
+} mangled_cpuids[] = {
+ {.function = 0xd, .index = 0},
+};
+
+static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+
+ for (i = 0; i < guest_cpuid->nent; i++) {
+ eax = guest_cpuid->entries[i].function;
+ ecx = guest_cpuid->entries[i].index;
+
+ cpuid(&eax, &ebx, &ecx, &edx);
+
+ GUEST_ASSERT(eax == guest_cpuid->entries[i].eax &&
+ ebx == guest_cpuid->entries[i].ebx &&
+ ecx == guest_cpuid->entries[i].ecx &&
+ edx == guest_cpuid->entries[i].edx);
+ }
+}
+
+static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid)
+{
+ u32 eax = 0x40000000, ebx, ecx = 0, edx;
+
+ cpuid(&eax, &ebx, &ecx, &edx);
+
+ GUEST_ASSERT(eax == 0x40000001);
+}
+
+static void guest_main(struct kvm_cpuid2 *guest_cpuid)
+{
+ GUEST_SYNC(1);
+
+ test_guest_cpuids(guest_cpuid);
+
+ GUEST_SYNC(2);
+
+ test_cpuid_40000000(guest_cpuid);
+
+ GUEST_DONE();
+}
+
+static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie)
+{
+ int i;
+
+ for (i = 0; i < sizeof(mangled_cpuids) / sizeof(mangled_cpuids[0]); i++) {
+ if (mangled_cpuids[i].function == entrie->function &&
+ mangled_cpuids[i].index == entrie->index)
+ return true;
+ }
+
+ return false;
+}
+
+static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie)
+{
+ int i;
+
+ for (i = 0; i < cpuid->nent; i++) {
+ if (cpuid->entries[i].function == entrie->function &&
+ cpuid->entries[i].index == entrie->index) {
+ if (is_cpuid_mangled(entrie))
+ return;
+
+ TEST_ASSERT(cpuid->entries[i].eax == entrie->eax &&
+ cpuid->entries[i].ebx == entrie->ebx &&
+ cpuid->entries[i].ecx == entrie->ecx &&
+ cpuid->entries[i].edx == entrie->edx,
+ "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
+ entrie->function, entrie->index,
+ cpuid->entries[i].eax, cpuid->entries[i].ebx,
+ cpuid->entries[i].ecx, cpuid->entries[i].edx,
+ entrie->eax, entrie->ebx, entrie->ecx, entrie->edx);
+ return;
+ }
+ }
+
+ TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index);
+}
+
+static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2)
+{
+ int i;
+
+ for (i = 0; i < cpuid1->nent; i++)
+ check_cpuid(cpuid2, &cpuid1->entries[i]);
+
+ for (i = 0; i < cpuid2->nent; i++)
+ check_cpuid(cpuid1, &cpuid2->entries[i]);
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+ struct ucall uc;
+
+ _vcpu_run(vm, vcpuid);
+
+ switch (get_ucall(vm, vcpuid, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1,
+ "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+ }
+}
+
+struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid)
+{
+ int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]);
+ vm_vaddr_t gva = vm_vaddr_alloc(vm, size,
+ getpagesize(), 0, 0);
+ struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva);
+
+ memcpy(guest_cpuids, cpuid, size);
+
+ *p_gva = gva;
+ return guest_cpuids;
+}
+
+int main(void)
+{
+ struct kvm_cpuid2 *supp_cpuid, *cpuid2;
+ vm_vaddr_t cpuid_gva;
+ struct kvm_vm *vm;
+ int stage;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_main);
+
+ supp_cpuid = kvm_get_supported_cpuid();
+ cpuid2 = vcpu_get_cpuid(vm, VCPU_ID);
+
+ compare_cpuids(supp_cpuid, cpuid2);
+
+ vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2);
+
+ vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva);
+
+ for (stage = 0; stage < 3; stage++)
+ run_vcpu(vm, VCPU_ID, stage);
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index 88a595b7fbdd..7e2d2d17d2ed 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -125,30 +125,6 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system)
" it should have: %d %d", system ? "KVM" : "vCPU", ret, errno);
}
-
-struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm, bool system)
-{
- int nent = 20; /* should be enough */
- static struct kvm_cpuid2 *cpuid;
-
- cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
-
- if (!cpuid) {
- perror("malloc");
- abort();
- }
-
- cpuid->nent = nent;
-
- if (!system)
- vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
- else
- kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
-
- return cpuid;
-}
-
-
int main(int argc, char *argv[])
{
struct kvm_vm *vm;
@@ -167,7 +143,7 @@ int main(int argc, char *argv[])
/* Test vCPU ioctl version */
test_hv_cpuid_e2big(vm, false);
- hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false);
+ hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID);
test_hv_cpuid(hv_cpuid_entries, false);
free(hv_cpuid_entries);
@@ -177,7 +153,7 @@ int main(int argc, char *argv[])
goto do_sys;
}
vcpu_enable_evmcs(vm, VCPU_ID);
- hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false);
+ hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID);
test_hv_cpuid(hv_cpuid_entries, true);
free(hv_cpuid_entries);
@@ -190,9 +166,8 @@ do_sys:
test_hv_cpuid_e2big(vm, true);
- hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, true);
+ hv_cpuid_entries = kvm_get_supported_hv_cpuid();
test_hv_cpuid(hv_cpuid_entries, nested_vmx_supported());
- free(hv_cpuid_entries);
out:
kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
new file mode 100644
index 000000000000..23051d84b907
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * VMX-pmu related msrs test
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Test to check the effect of various CPUID settings on the
+ * MSR_IA32_PERF_CAPABILITIES MSR, to verify that whatever is written with
+ * KVM_SET_MSR is _not_ modified from within the guest, and that it can be
+ * retrieved with KVM_GET_MSR.
+ *
+ * Test to check that invalid LBR formats are rejected.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define VCPU_ID 0
+
+#define X86_FEATURE_PDCM (1<<15)
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+#define PMU_CAP_LBR_FMT 0x3f
+
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union perf_capabilities {
+ struct {
+ u64 lbr_format:6;
+ u64 pebs_trap:1;
+ u64 pebs_arch_reg:1;
+ u64 pebs_format:4;
+ u64 smm_freeze:1;
+ u64 full_width_write:1;
+ u64 pebs_baseline:1;
+ u64 perf_metrics:1;
+ u64 pebs_output_pt_available:1;
+ u64 anythread_deprecated:1;
+ };
+ u64 capabilities;
+};
+
+static void guest_code(void)
+{
+ wrmsr(MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_cpuid2 *cpuid;
+ struct kvm_cpuid_entry2 *entry_1_0;
+ struct kvm_cpuid_entry2 *entry_a_0;
+ bool pdcm_supported = false;
+ struct kvm_vm *vm;
+ int ret;
+ union cpuid10_eax eax;
+ union perf_capabilities host_cap;
+
+ host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES);
+ host_cap.capabilities &= (PMU_CAP_FW_WRITES | PMU_CAP_LBR_FMT);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ cpuid = kvm_get_supported_cpuid();
+
+ if (kvm_get_cpuid_max_basic() >= 0xa) {
+ entry_1_0 = kvm_get_supported_cpuid_index(1, 0);
+ entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0);
+ pdcm_supported = entry_1_0 && !!(entry_1_0->ecx & X86_FEATURE_PDCM);
+ eax.full = entry_a_0->eax;
+ }
+ if (!pdcm_supported) {
+ print_skip("MSR_IA32_PERF_CAPABILITIES is not supported by the vCPU");
+ exit(KSFT_SKIP);
+ }
+ if (!eax.split.version_id) {
+ print_skip("PMU is not supported by the vCPU");
+ exit(KSFT_SKIP);
+ }
+
+ /* testcase 1, set capabilities when we have PDCM bit */
+ vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+ vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
+
+ /* check capabilities can be retrieved with KVM_GET_MSR */
+ ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES);
+
+ /* check whatever we write with KVM_SET_MSR is _not_ modified */
+ vcpu_run(vm, VCPU_ID);
+ ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES);
+
+ /* testcase 2, check valid LBR formats are accepted */
+ vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
+ ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
+
+ vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format);
+ ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format);
+
+ /* testcase 3, check invalid LBR format is rejected */
+ ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
+ TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+ /* testcase 4, set capabilities when we don't have PDCM bit */
+ entry_1_0->ecx &= ~X86_FEATURE_PDCM;
+ vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+ ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+ TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+ /* testcase 5, set capabilities when we don't have PMU version bits */
+ entry_1_0->ecx |= X86_FEATURE_PDCM;
+ eax.split.version_id = 0;
+ entry_a_0->eax = eax.full;
+ vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+ ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
+ TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+ vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
+ ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
+
+ kvm_vm_free(vm);
+}
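
The perf_capabilities union above is just a bitfield view of the raw MSR value. A standalone decoding sketch; the raw value is made up for the example (bits 5:0 give the LBR format, bit 13 the full-width counter write capability).

/* Sketch: decode a raw MSR_IA32_PERF_CAPABILITIES value via the union. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

union perf_capabilities {
        struct {
                u64 lbr_format:6;
                u64 pebs_trap:1;
                u64 pebs_arch_reg:1;
                u64 pebs_format:4;
                u64 smm_freeze:1;
                u64 full_width_write:1;
                u64 pebs_baseline:1;
                u64 perf_metrics:1;
                u64 pebs_output_pt_available:1;
                u64 anythread_deprecated:1;
        };
        u64 capabilities;
};

int main(void)
{
        union perf_capabilities cap;

        cap.capabilities = 0x2005;      /* hypothetical raw MSR value */

        /* 0x2005 decodes to lbr_format=0x5 and full_width_write=1. */
        printf("lbr_format=%#llx full_width_write=%llu\n",
               (unsigned long long)cap.lbr_format,
               (unsigned long long)cap.full_width_write);
        return 0;
}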
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c
new file mode 100644
index 000000000000..2f964cdc273c
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xapic_ipi_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake
+ * another vCPU that is halted when KVM's backing page for the APIC access
+ * address has been moved by mm.
+ *
+ * The test starts two vCPUs: one that sends IPIs and one that continually
+ * executes HLT. The sender checks that the halter has woken from the HLT and
+ * has reentered HLT before sending the next IPI. While the vCPUs are running,
+ * the host continually calls migrate_pages to move all of the process' pages
+ * amongst the available numa nodes on the machine.
+ *
+ * Migration is a command line option. When used on a non-NUMA machine it
+ * will exit with an error. The test is still useful on non-NUMA machines
+ * for exercising IPIs.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <getopt.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "kvm_util.h"
+#include "numaif.h"
+#include "processor.h"
+#include "test_util.h"
+#include "vmx.h"
+
+/* Default running time for the test */
+#define DEFAULT_RUN_SECS 3
+
+/* Default delay between migrate_pages calls (microseconds) */
+#define DEFAULT_DELAY_USECS 500000
+
+#define HALTER_VCPU_ID 0
+#define SENDER_VCPU_ID 1
+
+volatile uint32_t *apic_base = (volatile uint32_t *)APIC_DEFAULT_GPA;
+
+/*
+ * Vector for IPI from sender vCPU to halting vCPU.
+ * Value is arbitrary and was chosen for the alternating bit pattern. Any
+ * value should work.
+ */
+#define IPI_VECTOR 0xa5
+
+/*
+ * Incremented in the IPI handler. Provides evidence to the sender that the IPI
+ * arrived at the destination
+ */
+static volatile uint64_t ipis_rcvd;
+
+/* Data struct shared between host main thread and vCPUs */
+struct test_data_page {
+ uint32_t halter_apic_id;
+ volatile uint64_t hlt_count;
+ volatile uint64_t wake_count;
+ uint64_t ipis_sent;
+ uint64_t migrations_attempted;
+ uint64_t migrations_completed;
+ uint32_t icr;
+ uint32_t icr2;
+ uint32_t halter_tpr;
+ uint32_t halter_ppr;
+
+ /*
+ * Record local version register as a cross-check that APIC access
+ * worked. Value should match what KVM reports (APIC_VERSION in
+ * arch/x86/kvm/lapic.c). If test is failing, check that values match
+ * to determine whether APIC access exits are working.
+ */
+ uint32_t halter_lvr;
+};
+
+struct thread_params {
+ struct test_data_page *data;
+ struct kvm_vm *vm;
+ uint32_t vcpu_id;
+ uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */
+};
+
+uint32_t read_apic_reg(uint reg)
+{
+ return apic_base[reg >> 2];
+}
+
+void write_apic_reg(uint reg, uint32_t val)
+{
+ apic_base[reg >> 2] = val;
+}
+
+void disable_apic(void)
+{
+ wrmsr(MSR_IA32_APICBASE,
+ rdmsr(MSR_IA32_APICBASE) &
+ ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD));
+}
+
+void enable_xapic(void)
+{
+ uint64_t val = rdmsr(MSR_IA32_APICBASE);
+
+ /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */
+ if (val & MSR_IA32_APICBASE_EXTD) {
+ disable_apic();
+ wrmsr(MSR_IA32_APICBASE,
+ rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE);
+ } else if (!(val & MSR_IA32_APICBASE_ENABLE)) {
+ wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE);
+ }
+
+ /*
+ * Per SDM: reset value of spurious interrupt vector register has the
+ * APIC software enabled bit=0. It must be enabled in addition to the
+ * enable bit in the MSR.
+ */
+ val = read_apic_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED;
+ write_apic_reg(APIC_SPIV, val);
+}
+
+void verify_apic_base_addr(void)
+{
+ uint64_t msr = rdmsr(MSR_IA32_APICBASE);
+ uint64_t base = GET_APIC_BASE(msr);
+
+ GUEST_ASSERT(base == APIC_DEFAULT_GPA);
+}
+
+static void halter_guest_code(struct test_data_page *data)
+{
+ verify_apic_base_addr();
+ enable_xapic();
+
+ data->halter_apic_id = GET_APIC_ID_FIELD(read_apic_reg(APIC_ID));
+ data->halter_lvr = read_apic_reg(APIC_LVR);
+
+ /*
+ * Loop forever HLTing and recording halts & wakes. Disable interrupts
+ * each time around to minimize window between signaling the pending
+ * halt to the sender vCPU and executing the halt. No need to disable on
+ * first run as this vCPU executes first and the host waits for it to
+ * signal going into first halt before starting the sender vCPU. Record
+ * TPR and PPR for diagnostic purposes in case the test fails.
+ */
+ for (;;) {
+ data->halter_tpr = read_apic_reg(APIC_TASKPRI);
+ data->halter_ppr = read_apic_reg(APIC_PROCPRI);
+ data->hlt_count++;
+ asm volatile("sti; hlt; cli");
+ data->wake_count++;
+ }
+}
+
+/*
+ * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to
+ * enable diagnosing errant writes to the APIC access address backing page in
+ * case of test failure.
+ */
+static void guest_ipi_handler(struct ex_regs *regs)
+{
+ ipis_rcvd++;
+ write_apic_reg(APIC_EOI, 77);
+}
+
+static void sender_guest_code(struct test_data_page *data)
+{
+ uint64_t last_wake_count;
+ uint64_t last_hlt_count;
+ uint64_t last_ipis_rcvd_count;
+ uint32_t icr_val;
+ uint32_t icr2_val;
+ uint64_t tsc_start;
+
+ verify_apic_base_addr();
+ enable_xapic();
+
+ /*
+ * Init interrupt command register for sending IPIs
+ *
+ * Delivery mode=fixed, per SDM:
+ * "Delivers the interrupt specified in the vector field to the target
+ * processor."
+ *
+ * Destination mode=physical i.e. specify target by its local APIC
+ * ID. This vCPU assumes that the halter vCPU has already started and
+ * set data->halter_apic_id.
+ */
+ icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR);
+ icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id);
+ data->icr = icr_val;
+ data->icr2 = icr2_val;
+
+ last_wake_count = data->wake_count;
+ last_hlt_count = data->hlt_count;
+ last_ipis_rcvd_count = ipis_rcvd;
+ for (;;) {
+ /*
+ * Send IPI to halter vCPU.
+ * First IPI can be sent unconditionally because halter vCPU
+ * starts earlier.
+ */
+ write_apic_reg(APIC_ICR2, icr2_val);
+ write_apic_reg(APIC_ICR, icr_val);
+ data->ipis_sent++;
+
+ /*
+ * Wait up to ~1 sec for halter to indicate that it has:
+ * 1. Received the IPI
+ * 2. Woken up from the halt
+ * 3. Gone back into halt
+ * Current CPUs typically run at 2.x GHz, which is ~2
+ * billion ticks per second.
+ */
+ tsc_start = rdtsc();
+ while (rdtsc() - tsc_start < 2000000000) {
+ if ((ipis_rcvd != last_ipis_rcvd_count) &&
+ (data->wake_count != last_wake_count) &&
+ (data->hlt_count != last_hlt_count))
+ break;
+ }
+
+ GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) &&
+ (data->wake_count != last_wake_count) &&
+ (data->hlt_count != last_hlt_count));
+
+ last_wake_count = data->wake_count;
+ last_hlt_count = data->hlt_count;
+ last_ipis_rcvd_count = ipis_rcvd;
+ }
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct thread_params *params = (struct thread_params *)arg;
+ struct ucall uc;
+ int old;
+ int r;
+ unsigned int exit_reason;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(r == 0,
+ "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ params->vcpu_id, r);
+
+ fprintf(stderr, "vCPU thread running vCPU %u\n", params->vcpu_id);
+ vcpu_run(params->vm, params->vcpu_id);
+ exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason;
+
+ TEST_ASSERT(exit_reason == KVM_EXIT_IO,
+ "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO",
+ params->vcpu_id, exit_reason, exit_reason_str(exit_reason));
+
+ if (get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_ABORT) {
+ TEST_ASSERT(false,
+ "vCPU %u exited with error: %s.\n"
+ "Sending vCPU sent %lu IPIs to halting vCPU\n"
+ "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+ "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+ "Migrations attempted: %lu\n"
+ "Migrations completed: %lu\n",
+ params->vcpu_id, (const char *)uc.args[0],
+ params->data->ipis_sent, params->data->hlt_count,
+ params->data->wake_count,
+ *params->pipis_rcvd, params->data->halter_tpr,
+ params->data->halter_ppr, params->data->halter_lvr,
+ params->data->migrations_attempted,
+ params->data->migrations_completed);
+ }
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, uint32_t vcpu_id)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(r == 0,
+ "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu_id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(r == 0,
+ "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu_id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs,
+ uint64_t *pipis_rcvd)
+{
+ long pages_not_moved;
+ unsigned long nodemask = 0;
+ unsigned long nodemasks[sizeof(nodemask) * 8];
+ int nodes = 0;
+ time_t start_time, last_update, now;
+ time_t interval_secs = 1;
+ int i, r;
+ int from, to;
+ unsigned long bit;
+ uint64_t hlt_count;
+ uint64_t wake_count;
+ uint64_t ipis_sent;
+
+ fprintf(stderr, "Calling migrate_pages every %d microseconds\n",
+ delay_usecs);
+
+ /* Get set of first 64 numa nodes available */
+ r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8,
+ 0, MPOL_F_MEMS_ALLOWED);
+ TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno);
+
+ fprintf(stderr, "Numa nodes found amongst first %lu possible nodes "
+ "(each 1-bit indicates node is present): %#lx\n",
+ sizeof(nodemask) * 8, nodemask);
+
+ /* Init array of masks containing a single-bit in each, one for each
+ * available node. migrate_pages called below requires specifying nodes
+ * as bit masks.
+ */
+ for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) {
+ if (nodemask & bit) {
+ nodemasks[nodes] = nodemask & bit;
+ nodes++;
+ }
+ }
+
+ TEST_ASSERT(nodes > 1,
+ "Did not find at least 2 numa nodes. Can't do migration\n");
+
+ fprintf(stderr, "Migrating amongst %d nodes found\n", nodes);
+
+ from = 0;
+ to = 1;
+ start_time = time(NULL);
+ last_update = start_time;
+
+ ipis_sent = data->ipis_sent;
+ hlt_count = data->hlt_count;
+ wake_count = data->wake_count;
+
+ while ((int)(time(NULL) - start_time) < run_secs) {
+ data->migrations_attempted++;
+
+ /*
+ * migrate_pages with PID=0 will migrate all pages of this
+ * process between the nodes specified as bitmasks. The page
+ * backing the APIC access address belongs to this process
+ * because it is allocated by KVM in the context of the
+ * KVM_CREATE_VCPU ioctl. If that assumption ever changes this
+ * test may break or give a false positive signal.
+ */
+ pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]),
+ &nodemasks[from],
+ &nodemasks[to]);
+ if (pages_not_moved < 0)
+ fprintf(stderr,
+ "migrate_pages failed, errno=%d\n", errno);
+ else if (pages_not_moved > 0)
+ fprintf(stderr,
+ "migrate_pages could not move %ld pages\n",
+ pages_not_moved);
+ else
+ data->migrations_completed++;
+
+ from = to;
+ to++;
+ if (to == nodes)
+ to = 0;
+
+ now = time(NULL);
+ if (((now - start_time) % interval_secs == 0) &&
+ (now != last_update)) {
+ last_update = now;
+ fprintf(stderr,
+ "%lu seconds: Migrations attempted=%lu completed=%lu, "
+ "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n",
+ now - start_time, data->migrations_attempted,
+ data->migrations_completed,
+ data->ipis_sent, *pipis_rcvd,
+ data->hlt_count, data->wake_count);
+
+ TEST_ASSERT(ipis_sent != data->ipis_sent &&
+ hlt_count != data->hlt_count &&
+ wake_count != data->wake_count,
+ "IPI, HLT and wake count have not increased "
+ "in the last %lu seconds. "
+ "HLTer is likely hung.\n", interval_secs);
+
+ ipis_sent = data->ipis_sent;
+ hlt_count = data->hlt_count;
+ wake_count = data->wake_count;
+ }
+ usleep(delay_usecs);
+ }
+}
+
+void get_cmdline_args(int argc, char *argv[], int *run_secs,
+ bool *migrate, int *delay_usecs)
+{
+ for (;;) {
+ int opt = getopt(argc, argv, "s:d:m");
+
+ if (opt == -1)
+ break;
+ switch (opt) {
+ case 's':
+ *run_secs = parse_size(optarg);
+ break;
+ case 'm':
+ *migrate = true;
+ break;
+ case 'd':
+ *delay_usecs = parse_size(optarg);
+ break;
+ default:
+ TEST_ASSERT(false,
+ "Usage: -s <runtime seconds>. Default is %d seconds.\n"
+ "-m adds calls to migrate_pages while vCPUs are running."
+ " Default is no migrations.\n"
+ "-d <delay microseconds> - delay between migrate_pages() calls."
+ " Default is %d microseconds.\n",
+ DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS);
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ int r;
+ int wait_secs;
+ const int max_halter_wait = 10;
+ int run_secs = 0;
+ int delay_usecs = 0;
+ struct test_data_page *data;
+ vm_vaddr_t test_data_page_vaddr;
+ bool migrate = false;
+ pthread_t threads[2];
+ struct thread_params params[2];
+ struct kvm_vm *vm;
+ uint64_t *pipis_rcvd;
+
+ get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs);
+ if (run_secs <= 0)
+ run_secs = DEFAULT_RUN_SECS;
+ if (delay_usecs <= 0)
+ delay_usecs = DEFAULT_DELAY_USECS;
+
+ vm = vm_create_default(HALTER_VCPU_ID, 0, halter_guest_code);
+ params[0].vm = vm;
+ params[1].vm = vm;
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID);
+ vm_handle_exception(vm, IPI_VECTOR, guest_ipi_handler);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA, 0);
+
+ vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code);
+
+ test_data_page_vaddr = vm_vaddr_alloc(vm, 0x1000, 0x1000, 0, 0);
+ data =
+ (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr);
+ memset(data, 0, sizeof(*data));
+ params[0].data = data;
+ params[1].data = data;
+
+ vcpu_args_set(vm, HALTER_VCPU_ID, 1, test_data_page_vaddr);
+ vcpu_args_set(vm, SENDER_VCPU_ID, 1, test_data_page_vaddr);
+
+ pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd);
+ params[0].pipis_rcvd = pipis_rcvd;
+ params[1].pipis_rcvd = pipis_rcvd;
+
+ /* Start halter vCPU thread and wait for it to execute first HLT. */
+ params[0].vcpu_id = HALTER_VCPU_ID;
+ r = pthread_create(&threads[0], NULL, vcpu_thread, &params[0]);
+ TEST_ASSERT(r == 0,
+ "pthread_create halter failed errno=%d", errno);
+ fprintf(stderr, "Halter vCPU thread started\n");
+
+ wait_secs = 0;
+ while ((wait_secs < max_halter_wait) && !data->hlt_count) {
+ sleep(1);
+ wait_secs++;
+ }
+
+ TEST_ASSERT(data->hlt_count,
+ "Halter vCPU did not execute first HLT within %d seconds",
+ max_halter_wait);
+
+ fprintf(stderr,
+ "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n",
+ data->halter_apic_id, wait_secs);
+
+ params[1].vcpu_id = SENDER_VCPU_ID;
+ r = pthread_create(&threads[1], NULL, vcpu_thread, &params[1]);
+ TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno);
+
+ fprintf(stderr,
+ "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n",
+ run_secs);
+
+ if (!migrate)
+ sleep(run_secs);
+ else
+ do_migrations(data, run_secs, delay_usecs, pipis_rcvd);
+
+ /*
+ * Cancel threads and wait for them to stop.
+ */
+ cancel_join_vcpu_thread(threads[0], HALTER_VCPU_ID);
+ cancel_join_vcpu_thread(threads[1], SENDER_VCPU_ID);
+
+ fprintf(stderr,
+ "Test successful after running for %d seconds.\n"
+ "Sending vCPU sent %lu IPIs to halting vCPU\n"
+ "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+ "Halter APIC ID=%#x\n"
+ "Sender ICR value=%#x ICR2 value=%#x\n"
+ "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+ "Migrations attempted: %lu\n"
+ "Migrations completed: %lu\n",
+ run_secs, data->ipis_sent,
+ data->hlt_count, data->wake_count, *pipis_rcvd,
+ data->halter_apic_id,
+ data->icr, data->icr2,
+ data->halter_tpr, data->halter_ppr, data->halter_lvr,
+ data->migrations_attempted, data->migrations_completed);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
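
The ICR programming that sender_guest_code() documents above boils down to two MMIO writes. A condensed guest-side sketch using the register offsets and macros from processor.h; APIC_DEST_PHYSICAL and APIC_DM_FIXED are assumed to come from that same header.

/*
 * Sketch: send one fixed-vector, physical-destination xAPIC IPI.
 * 'apic' is the MMIO-mapped xAPIC base, as in the test above.
 */
#include <stdint.h>
#include "processor.h"

static void send_one_xapic_ipi(volatile uint32_t *apic, uint32_t dest_apic_id,
                               uint8_t vector)
{
        /* ICR2 carries the destination APIC ID in bits 31:24; write it first. */
        apic[APIC_ICR2 >> 2] = SET_APIC_DEST_FIELD(dest_apic_id);

        /* Writing ICR triggers the IPI: fixed delivery, physical destination. */
        apic[APIC_ICR >> 2] = APIC_DEST_PHYSICAL | APIC_DM_FIXED | vector;
}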
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
new file mode 100644
index 000000000000..9246ea310587
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xen_shinfo_test
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Xen shared_info / pvclock testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+
+#define VCPU_ID 5
+
+#define SHINFO_REGION_GPA 0xc0000000ULL
+#define SHINFO_REGION_SLOT 10
+#define PAGE_SIZE 4096
+
+#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE)
+
+static struct kvm_vm *vm;
+
+#define XEN_HYPERCALL_MSR 0x40000000
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 flags;
+ u8 pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
+
+static void guest_code(void)
+{
+ GUEST_DONE();
+}
+
+static int cmp_timespec(struct timespec *a, struct timespec *b)
+{
+ if (a->tv_sec > b->tv_sec)
+ return 1;
+ else if (a->tv_sec < b->tv_sec)
+ return -1;
+ else if (a->tv_nsec > b->tv_nsec)
+ return 1;
+ else if (a->tv_nsec < b->tv_nsec)
+ return -1;
+ else
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ struct timespec min_ts, max_ts, vm_ts;
+
+ if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
+ KVM_XEN_HVM_CONFIG_SHARED_INFO)) {
+ print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available");
+ exit(KSFT_SKIP);
+ }
+
+ clock_gettime(CLOCK_REALTIME, &min_ts);
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /* Map a region for the shared_info page */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
+
+ struct kvm_xen_hvm_config hvmc = {
+ .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+ .msr = XEN_HYPERCALL_MSR,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+ struct kvm_xen_hvm_attr lm = {
+ .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+ .u.long_mode = 1,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
+
+ struct kvm_xen_hvm_attr ha = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
+
+ struct kvm_xen_vcpu_attr vi = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+ .u.gpa = SHINFO_REGION_GPA + 0x40,
+ };
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi);
+
+ struct kvm_xen_vcpu_attr pvclock = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+ .u.gpa = PVTIME_ADDR,
+ };
+ vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+
+ done:
+ clock_gettime(CLOCK_REALTIME, &max_ts);
+
+ /*
+ * Just a *really* basic check that things are being put in the
+ * right place. The actual calculations are much the same for
+ * Xen as they are for the KVM variants, so no need to check.
+ */
+ struct pvclock_wall_clock *wc;
+ struct pvclock_vcpu_time_info *ti, *ti2;
+
+ wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
+ ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
+ ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
+
+ vm_ts.tv_sec = wc->sec;
+ vm_ts.tv_nsec = wc->nsec;
+ TEST_ASSERT(wc->version && !(wc->version & 1),
+ "Bad wallclock version %x", wc->version);
+ TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
+ TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
+
+ TEST_ASSERT(ti->version && !(ti->version & 1),
+ "Bad time_info version %x", ti->version);
+ TEST_ASSERT(ti2->version && !(ti2->version & 1),
+ "Bad time_info version %x", ti->version);
+
+ kvm_vm_free(vm);
+ return 0;
+}
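
For reference, the even/odd version checks asserted above follow the usual pvclock seqcount convention: the hypervisor makes the version field odd while it is updating a record and returns it to an even value once the update is complete, so a consistent snapshot is obtained by re-reading until the version is even and unchanged. A minimal reader sketch; pvclock_read_time_info() is a hypothetical helper (not a selftest API), and plain compiler barriers stand in for the rmb() a real guest would use:

static struct pvclock_vcpu_time_info
pvclock_read_time_info(const struct pvclock_vcpu_time_info *ti)
{
	struct pvclock_vcpu_time_info snap;
	uint32_t v1, v2;

	do {
		v1 = ti->version;
		__asm__ __volatile__("" ::: "memory");	/* compiler barrier only */
		snap = *ti;
		__asm__ __volatile__("" ::: "memory");
		v2 = ti->version;
	} while ((v1 & 1) || v1 != v2);	/* odd => update in progress */

	return snap;
}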
diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
new file mode 100644
index 000000000000..8389e0bfd711
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xen_vmcall_test
+ *
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates.
+ *
+ * Userspace hypercall testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+#define HCALL_REGION_GPA 0xc0000000ULL
+#define HCALL_REGION_SLOT 10
+#define PAGE_SIZE 4096
+
+static struct kvm_vm *vm;
+
+#define INPUTVALUE 17
+#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x)
+#define RETVALUE 0xcafef00dfbfbffffUL
+
+#define XEN_HYPERCALL_MSR 0x40000200
+#define HV_GUEST_OS_ID_MSR 0x40000000
+#define HV_HYPERCALL_MSR 0x40000001
+
+#define HVCALL_SIGNAL_EVENT 0x005d
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+static void guest_code(void)
+{
+ unsigned long rax = INPUTVALUE;
+ unsigned long rdi = ARGVALUE(1);
+ unsigned long rsi = ARGVALUE(2);
+ unsigned long rdx = ARGVALUE(3);
+ unsigned long rcx;
+ register unsigned long r10 __asm__("r10") = ARGVALUE(4);
+ register unsigned long r8 __asm__("r8") = ARGVALUE(5);
+ register unsigned long r9 __asm__("r9") = ARGVALUE(6);
+
+ /* First a direct invocation of 'vmcall' */
+ __asm__ __volatile__("vmcall" :
+ "=a"(rax) :
+ "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+ "r"(r10), "r"(r8), "r"(r9));
+ GUEST_ASSERT(rax == RETVALUE);
+
+ /* Fill in the Xen hypercall page */
+ __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR),
+ "a" (HCALL_REGION_GPA & 0xffffffff),
+ "d" (HCALL_REGION_GPA >> 32));
+
+ /* Set Hyper-V Guest OS ID */
+ __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR),
+ "a" (0x5a), "d" (0));
+
+ /* Hyper-V hypercall page (bit 0 of the MSR value is the enable bit) */
+ u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1;
+ __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR),
+ "a" (msrval & 0xffffffff),
+ "d" (msrval >> 32));
+
+ /* Invoke a Xen hypercall */
+ __asm__ __volatile__("call *%1" : "=a"(rax) :
+ "r"(HCALL_REGION_GPA + INPUTVALUE * 32),
+ "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+ "r"(r10), "r"(r8), "r"(r9));
+ GUEST_ASSERT(rax == RETVALUE);
+
+ /* Invoke a Hyper-V hypercall */
+ rax = 0;
+ rcx = HVCALL_SIGNAL_EVENT; /* code */
+ rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */
+ __asm__ __volatile__("call *%1" : "=a"(rax) :
+ "r"(HCALL_REGION_GPA + PAGE_SIZE),
+ "a"(rax), "c"(rcx), "d"(rdx),
+ "r"(r8));
+ GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)) {
+ print_skip("KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL not available");
+ exit(KSFT_SKIP);
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+ struct kvm_xen_hvm_config hvmc = {
+ .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+ .msr = XEN_HYPERCALL_MSR,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+ /* Map a region for the hypercall pages */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0);
+ virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2, 0);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+
+ if (run->exit_reason == KVM_EXIT_XEN) {
+ ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL);
+ ASSERT_EQ(run->xen.u.hcall.cpl, 0);
+ ASSERT_EQ(run->xen.u.hcall.longmode, 1);
+ ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE);
+ ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1));
+ ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2));
+ ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3));
+ ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4));
+ ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5));
+ ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6));
+ run->xen.u.hcall.result = RETVALUE;
+ continue;
+ }
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
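
One detail worth spelling out from guest_code() above: a Xen hypercall page is an array of 32-byte stubs indexed by hypercall number, so hypercall nr is entered at page + nr * 32, which is why the Xen call targets HCALL_REGION_GPA + INPUTVALUE * 32 (the region is identity-mapped by virt_map()); the Hyper-V hypercall page, by contrast, has a single entry point at its base. A trivial helper capturing the Xen layout; xen_hypercall_entry() is purely illustrative and not a selftest API:

/* Illustrative only: guest-virtual address of the stub for Xen hypercall
 * 'nr' when the hypercall page is mapped at 'page_gva'. */
static inline unsigned long xen_hypercall_entry(unsigned long page_gva,
						unsigned int nr)
{
	return page_gva + nr * 32;	/* each stub occupies 32 bytes */
}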
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 9d01299563ee..7aafefc50aa7 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/kvm_dirty_ring.h>
#include <trace/events/kvm.h>
+#include "mmu_lock.h"
int __weak kvm_cpu_dirty_log_size(void)
{
@@ -60,17 +61,16 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
if (!memslot || (offset + __fls(mask)) >= memslot->npages)
return;
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
}
int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size)
{
- ring->dirty_gfns = vmalloc(size);
+ ring->dirty_gfns = vzalloc(size);
if (!ring->dirty_gfns)
return -ENOMEM;
- memset(ring->dirty_gfns, 0, size);
ring->size = size / sizeof(struct kvm_dirty_gfn);
ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8367d88ce39b..001b9de4e727 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -58,6 +58,7 @@
#include "coalesced_mmio.h"
#include "async_pf.h"
+#include "mmu_lock.h"
#include "vfio.h"
#define CREATE_TRACE_POINTS
@@ -459,13 +460,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
int idx;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+
+ KVM_MMU_LOCK(kvm);
+
kvm->mmu_notifier_seq++;
if (kvm_set_spte_hva(kvm, address, pte))
kvm_flush_remote_tlbs(kvm);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -476,7 +479,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
int need_tlb_flush = 0, idx;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
/*
* The count increase must become visible at unlock time as no
* spte can be established without taking the mmu_lock and
@@ -489,7 +492,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
if (need_tlb_flush || kvm->tlbs_dirty)
kvm_flush_remote_tlbs(kvm);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
@@ -500,7 +503,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
/*
* This sequence increase will notify the kvm page fault that
* the page that is going to be mapped in the spte could have
@@ -514,7 +517,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
* in conjunction with the smp_rmb in mmu_notifier_retry().
*/
kvm->mmu_notifier_count--;
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
BUG_ON(kvm->mmu_notifier_count < 0);
}
@@ -528,13 +531,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
young = kvm_age_hva(kvm, start, end);
if (young)
kvm_flush_remote_tlbs(kvm);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
return young;
@@ -549,7 +552,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
/*
* Even though we do not flush TLB, this will still adversely
* affect performance on pre-Haswell Intel EPT, where there is
@@ -564,7 +567,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
* more sophisticated heuristic later.
*/
young = kvm_age_hva(kvm, start, end);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
return young;
@@ -578,9 +581,9 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
young = kvm_test_age_hva(kvm, address);
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
return young;
@@ -745,7 +748,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
if (!kvm)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&kvm->mmu_lock);
+ KVM_MMU_LOCK_INIT(kvm);
mmgrab(current->mm);
kvm->mm = current->mm;
kvm_eventfd_init(kvm);
@@ -1525,7 +1528,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
memset(dirty_bitmap_buffer, 0, n);
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
for (i = 0; i < n / sizeof(long); i++) {
unsigned long mask;
gfn_t offset;
@@ -1541,7 +1544,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
offset, mask);
}
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
}
if (flush)
@@ -1636,7 +1639,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
return -EFAULT;
- spin_lock(&kvm->mmu_lock);
+ KVM_MMU_LOCK(kvm);
for (offset = log->first_page, i = offset / BITS_PER_LONG,
n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
i++, offset += BITS_PER_LONG) {
@@ -1659,7 +1662,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
offset, mask);
}
}
- spin_unlock(&kvm->mmu_lock);
+ KVM_MMU_UNLOCK(kvm);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -1903,10 +1906,12 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
bool write_fault, bool *writable,
kvm_pfn_t *p_pfn)
{
- unsigned long pfn;
+ kvm_pfn_t pfn;
+ pte_t *ptep;
+ spinlock_t *ptl;
int r;
- r = follow_pfn(vma, addr, &pfn);
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
if (r) {
/*
* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -1921,14 +1926,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
if (r)
return r;
- r = follow_pfn(vma, addr, &pfn);
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
if (r)
return r;
+ }
+ if (write_fault && !pte_write(*ptep)) {
+ pfn = KVM_PFN_ERR_RO_FAULT;
+ goto out;
}
if (writable)
- *writable = true;
+ *writable = pte_write(*ptep);
+ pfn = pte_pfn(*ptep);
/*
* Get a reference here because callers of *hva_to_pfn* and
@@ -1943,6 +1953,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
*/
kvm_get_pfn(pfn);
+out:
+ pte_unmap_unlock(ptep, ptl);
*p_pfn = pfn;
return 0;
}
diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h
new file mode 100644
index 000000000000..9e1308f9734c
--- /dev/null
+++ b/virt/kvm/mmu_lock.h
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_MMU_LOCK_H
+#define KVM_MMU_LOCK_H 1
+
+/*
+ * Architectures can choose whether to use an rwlock or spinlock
+ * for the mmu_lock. These macros, for use in common code
+ * only, avoid using #ifdefs in places that must deal with
+ * multiple architectures.
+ */
+
+#ifdef KVM_HAVE_MMU_RWLOCK
+#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock)
+#else
+#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock)
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
+#endif
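
For context, not part of this hunk: the macros above only work if the mmu_lock member of struct kvm has a matching type. Elsewhere in this series the declaration in include/linux/kvm_host.h is keyed off the same symbol (only x86 is expected to define KVM_HAVE_MMU_RWLOCK at this point), roughly as sketched below; the exact placement and comments are assumptions, not quoted from the patch:

#ifdef KVM_HAVE_MMU_RWLOCK
	rwlock_t mmu_lock;	/* taken for read by concurrent page faults */
#else
	spinlock_t mmu_lock;	/* other configurations keep the spinlock */
#endif /* KVM_HAVE_MMU_RWLOCK */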